Beispiel #1
0
    def scrape_bill(self, row, chamber, session):
        bill_id = row['LegislationNumber']

        # TODO: re-evaluate if these should be separate bills
        if 'SA' in bill_id or 'HA' in bill_id:
            self.warning('skipping amendment %s', bill_id)
            return

        bill_type = self.classify_bill(bill_id)
        bill = Bill(identifier=bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=row['LongTitle'],
                    classification=bill_type)
        if row['Synopsis']:
            bill.add_abstract(row['Synopsis'], 'synopsis')
        if row['ShortTitle']:
            bill.add_title(row['ShortTitle'], 'short title')
        if row['SponsorPersonId']:
            self.add_sponsor_by_legislator_id(bill, row['SponsorPersonId'], 'primary')

        # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
        html_url = 'https://legis.delaware.gov/BillDetail?LegislationId={}'.format(
            row['LegislationId']
        )
        bill.add_source(html_url, note='text/html')

        html = self.lxmlize(html_url)

        # Additional Sponsors: '//label[text()="Additional Sponsor(s):"]/following-sibling::div/a'
        additional_sponsors = html.xpath('//label[text()="Additional Sponsor(s):"]'
                                         '/following-sibling::div/a/@href')
        for sponsor_url in additional_sponsors:
            sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                             'personId=', '')
            self.add_sponsor_by_legislator_id(bill, sponsor_id, 'primary')

        # CoSponsors: '//label[text()="Co-Sponsor(s):"]/following-sibling::div/a'
        cosponsors = html.xpath('//label[text()="Additional Sponsor(s):"]/'
                                'following-sibling::div/a/@href')
        for sponsor_url in cosponsors:
            sponsor_id = sponsor_url.replace('https://legis.delaware.gov/LegislatorDetail?'
                                             'personId=', '')
            self.add_sponsor_by_legislator_id(bill, sponsor_id, 'cosponsor')

        versions = html.xpath('//label[text()="Original Text:"]/following-sibling::div/a/@href')
        for version_url in versions:
            media_type = self.mime_from_link(version_url)
            version_name = 'Bill Text'
            # on_duplicate='error'
            bill.add_version_link(version_name, version_url, media_type=media_type)

        fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
        for fiscal in fiscals:
            self.scrape_fiscal_note(bill, fiscal)

        self.scrape_actions(bill, row['LegislationId'])
        yield from self.scrape_votes(bill, row['LegislationId'], session)

        yield bill
Beispiel #2
0
    def scrape_table(self, page):
        for tr in page.xpath('//table[@id="ContentPlaceHolder1_GR1"]/tr')[2:]:
            print(lxml.etree.tostring(tr))
            print("\n")
            year = tr.xpath('td[1]/text()')[0]
            bill_no = tr.xpath('td[2]/text()')[0]
            title = tr.xpath('td[3]/a[1]/text()')[0].strip()
            versions = tr.xpath('td[3]/a')[1:]
            bill = Bill(identifier=bill_no,
                        legislative_session=year,
                        title=title,
                        classification="resolution")

            for version in versions:
                url = version.xpath('@href')[0]
                mime = mimetypes.guess_type(url)[0]

                bill.add_version_link(version.text_content().strip(),
                                      url,
                                      media_type=mime)

            if tr.xpath('string(td[6])'):
                intro_date_str = tr.xpath('string(td[6])')
                regex = re.compile(
                    r'\s+(?P<date>\d{2}/\d{2}/\d{4})\s+\((?P<chamber>Lok Sabha|Rajya Sabha\))'
                )
                for match in regex.finditer(intro_date_str):
                    print(match.groupdict())
                    if match.group('chamber') == 'Lok Sabha':
                        action_chamber = 'lower'
                    elif match.group('chamber') == 'Rajya Sabha':
                        action_chamber = 'upper'

                    intro_date = datetime.strptime(match.group('date'),
                                                   '%d/%m/%Y')
                    intro_date = self.tz.localize(intro_date)

                    bill.add_action(
                        "Introduced in {}".format(match.group(
                            'chamber')),  # Action description, from the state
                        intro_date,
                        chamber=action_chamber,
                        classification='introduction')

            if tr.xpath('string(td[7])'):
                passed_lower = tr.xpath('string(td[7])')
                regex = r'\s+(?P<date>\d{2}/\d{2}/\d{4})\s+'
                print("PL Date:")
                print(passed_lower)

            if tr.xpath('string(td[8])'):
                passed_upper = tr.xpath('string(td[8])')
                regex = r'\s+(?P<date>\d{2}/\d{2}/\d{4})\s+'
                print("PU Date:")
                print(passed_upper)

            bill.add_source(self.base_url)
            #print( tr.xpath('td[3]/a[3]/text()'))
            print(bill)
            yield bill
Beispiel #3
0
    def scrape_bill(self, session, bill_id, chamber):
        # https://malegislature.gov/Bills/189/SD2739
        session_for_url = self.replace_non_digits(session)
        bill_url = 'https://malegislature.gov/Bills/{}/{}'.format(session_for_url, bill_id)

        try:
            response = requests.get(bill_url)
        except requests.exceptions.RequestException as e:
            self.warning(u'Server Error on {}'.format(bill_url))
            return False

        html = response.text

        page = lxml.html.fromstring(html)

        if not page.xpath('//div[contains(@class, "followable")]/h1/text()'):
            self.warning(u'Server Error on {}'.format(bill_url))
            return False

        bill_title = page.xpath('//div[@id="contentContainer"]/div/div/h2/text()')[0]

        bill_id = re.sub(r'[^S|H|D|\d]', '', bill_id)

        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=bill_title, classification='bill')

        bill_summary = None
        if page.xpath('//p[@id="pinslip"]/text()'):
            bill_summary = page.xpath('//p[@id="pinslip"]/text()')[0]
        if bill_summary:
            bill.add_abstract(bill_summary, 'summary')

        bill.add_source(bill_url)

        # https://malegislature.gov/Bills/189/SD2739 has a presenter
        # https://malegislature.gov/Bills/189/S2168 no sponsor
        # Find the non-blank text of the dt following Sponsor or Presenter,
        # including any child link text.
        sponsor = page.xpath(
            '//dt[text()="Sponsor:" or text()="Presenter:"]/'
            'following-sibling::dd/descendant-or-self::*/text()[normalize-space()]')
        if sponsor:
            sponsor = sponsor[0].strip()
            bill.add_sponsorship(sponsor, classification='primary', primary=True,
                                 entity_type='person')

        self.scrape_cosponsors(bill, bill_url)

        version = page.xpath("//div[contains(@class, 'modalBtnGroup')]/"
                             "a[contains(text(), 'Download PDF') and not(@disabled)]/@href")
        if version:
            version_url = "https://malegislature.gov{}".format(version[0])
            bill.add_version_link('Bill Text', version_url, media_type='application/pdf')

        # yield back votes and bill
        yield from self.scrape_actions(bill, bill_url, session)
        yield bill
Beispiel #4
0
    def scrape_bills(self, session):
        session_key = SESSION_KEYS[session]
        measures_response = self.api_client.get('measures', page=500, session=session_key)

        legislators = index_legislators(self, session_key)

        for measure in measures_response:
            bid = '{} {}'.format(measure['MeasurePrefix'], measure['MeasureNumber'])

            chamber = self.chamber_code[bid[0]]
            bill = Bill(
                bid.replace(' ', ''),
                legislative_session=session,
                chamber=chamber,
                title=measure['RelatingTo'],
                classification=self.bill_types[measure['MeasurePrefix'][1:]]
            )
            bill.add_abstract(measure['MeasureSummary'].strip(), note='summary')

            for sponsor in measure['MeasureSponsors']:
                legislator_code = sponsor['LegislatoreCode']  # typo in API
                if legislator_code:
                    try:
                        legislator = legislators[legislator_code]
                    except KeyError:
                        logger.warn('Legislator {} not found in session {}'.format(
                            legislator_code, session))
                        legislator = legislator_code
                    bill.add_sponsorship(
                        name=legislator,
                        classification={'Chief': 'primary', 'Regular': 'cosponsor'}[
                            sponsor['SponsorLevel']],
                        entity_type='person',
                        primary=True if sponsor['SponsorLevel'] == 'Chief' else False
                    )

            bill.add_source(
                "https://olis.leg.state.or.us/liz/{session}/Measures/Overview/{bid}".format(
                    session=session_key, bid=bid.replace(' ', ''))
            )
            for document in measure['MeasureDocuments']:
                # TODO: probably mixing documents & versions here - should revisit
                try:
                    bill.add_version_link(document['VersionDescription'], document['DocumentUrl'],
                                          media_type='application/pdf')
                except ValueError:
                    logger.warn('Duplicate link found for {}'.format(document['DocumentUrl']))
            for action in measure['MeasureHistoryActions']:
                classifiers = self.determine_action_classifiers(action['ActionText'])
                when = datetime.datetime.strptime(action['ActionDate'], '%Y-%m-%dT%H:%M:%S')
                when = self.tz.localize(when)
                bill.add_action(action['ActionText'], when,
                                chamber=self.chamber_code[action['Chamber']],
                                classification=classifiers)

            yield bill
Beispiel #5
0
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
        page = csv.reader(get_utf_16_ftp_content(url).splitlines(),
                          delimiter="|")

        for row in page:
            bill_chamber = {"H": "lower", "S": "upper"}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r"(H|S)([A-Z]+)\s", bill_id).group(2)
            bill_type = {
                "B": "bill",
                "R": "resolution",
                "JR": "joint resolution",
                "CR": "concurrent resolution",
                "MR": "memorial",
                "CMR": "concurrent memorial",
            }[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=chamber,
                title=row[3],
                classification=bill_type,
            )
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(
                    primary,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/Searchable/%s.pdf" %
                           (self.slug, bill_id.replace(" ", "")))
            bill.add_version_link(bill_id,
                                  version_url,
                                  media_type="application/pdf")

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Beispiel #6
0
    def scrape_bills(self, session):
        session_key = SESSION_KEYS[session]
        measures_response = self.api_client.get('measures', page=500, session=session_key)

        legislators = index_legislators(self, session_key)

        for measure in measures_response:
            bid = '{} {}'.format(measure['MeasurePrefix'], measure['MeasureNumber'])

            chamber = self.chamber_code[bid[0]]
            bill = Bill(
                bid.replace(' ', ''),
                legislative_session=session,
                chamber=chamber,
                title=measure['RelatingTo'],
                classification=self.bill_types[measure['MeasurePrefix'][1:]]
            )
            bill.add_abstract(measure['MeasureSummary'].strip(), note='summary')

            for sponsor in measure['MeasureSponsors']:
                legislator_code = sponsor['LegislatoreCode']  # typo in API
                if legislator_code:
                    try:
                        legislator = legislators[legislator_code]
                    except KeyError:
                        logger.warn('Legislator {} not found in session {}'.format(
                            legislator_code, session))
                        legislator = legislator_code
                    bill.add_sponsorship(
                        name=legislator,
                        classification={'Chief': 'primary', 'Regular': 'cosponsor'}[
                            sponsor['SponsorLevel']],
                        entity_type='person',
                        primary=True if sponsor['SponsorLevel'] == 'Chief' else False
                    )

            bill.add_source(
                "https://olis.leg.state.or.us/liz/{session}/Measures/Overview/{bid}".format(
                    session=session_key, bid=bid.replace(' ', ''))
            )
            for document in measure['MeasureDocuments']:
                # TODO: probably mixing documents & versions here - should revisit
                try:
                    bill.add_version_link(document['VersionDescription'], document['DocumentUrl'],
                                          media_type='application/pdf')
                except ValueError:
                    logger.warn('Duplicate link found for {}'.format(document['DocumentUrl']))
            for action in measure['MeasureHistoryActions']:
                classifiers = self.determine_action_classifiers(action['ActionText'])
                when = datetime.datetime.strptime(action['ActionDate'], '%Y-%m-%dT%H:%M:%S')
                when = self.tz.localize(when)
                bill.add_action(action['ActionText'], when,
                                chamber=self.chamber_code[action['Chamber']],
                                classification=classifiers)

            yield bill
Beispiel #7
0
    def scrape_bill(self, chamber, session, bill_id):
        bill_num = bill_id.split()[1]

        url = ("%s/GetLegislation?biennium=%s&billNumber"
               "=%s" % (self._base_url, self.biennium, bill_num))

        page = self.get(url)
        page = lxml.etree.fromstring(page.content)
        page = xpath(page, "//wa:Legislation")[0]

        xml_chamber = xpath(page, 'string(wa:OriginalAgency)')
        chamber = self._chamber_map[xml_chamber]

        title = xpath(page, "string(wa:LongDescription)")

        bill_type = xpath(
            page, "string(wa:ShortLegislationType/wa:LongLegislationType)")
        bill_type = bill_type.lower()

        if bill_type == 'gubernatorial appointment':
            return

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=[bill_type])
        fake_source = ("http://apps.leg.wa.gov/billinfo/"
                       "summary.aspx?bill=%s&year=%s" %
                       (bill_num, session[0:4]))

        bill.add_source(fake_source)

        try:
            for version in self.versions[bill_id]:
                bill.add_version_link(note=version['note'],
                                      url=version['url'],
                                      media_type=version['media_type'])
        except KeyError:
            self.warning("No versions were found for {}".format(bill_id))

        try:
            for document in self.documents[bill_num]:
                bill.add_document_link(note=document['note'],
                                       url=document['url'],
                                       media_type=document['media_type'])
        except KeyError:
            pass

        self.scrape_sponsors(bill)
        self.scrape_actions(bill, bill_num)
        self.scrape_hearings(bill, bill_num)
        yield from self.scrape_votes(bill)
        bill.subject = list(set(self._subjects[bill_id]))
        yield bill
Beispiel #8
0
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
        page = self.get(url).text
        page = unicode_csv_reader(StringIO(page), delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial',
                'CMR': 'concurrent memorial'
            }[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=row[3],
                        classification=bill_type)
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(primary,
                                     classification='primary',
                                     entity_type='person',
                                     primary=True)
            # ftp://www.arkleg.state.ar.us/Bills/
            # TODO: Keep on eye on this post 2017 to see if they apply R going forward.
            session_code = '2017R' if session == '2017' else session

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/%s.pdf" %
                           (session_code, bill_id.replace(' ', '')))
            bill.add_version_link(bill_id,
                                  version_url,
                                  media_type='application/pdf')

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Beispiel #9
0
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
        page = csv.reader(get_utf_16_ftp_content(url).splitlines(),
                          delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial',
                'CMR': 'concurrent memorial'
            }[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(bill_id,
                        legislative_session=session,
                        chamber=chamber,
                        title=row[3],
                        classification=bill_type)
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(primary,
                                     classification='primary',
                                     entity_type='person',
                                     primary=True)

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/Searchable/%s.pdf" %
                           (self.slug, bill_id.replace(' ', '')))
            bill.add_version_link(bill_id,
                                  version_url,
                                  media_type='application/pdf')

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Beispiel #10
0
    def scrape_bill(self, chamber, session, bill_id):
        bill_num = bill_id.split()[1]

        url = ("%s/GetLegislation?biennium=%s&billNumber"
               "=%s" % (self._base_url, self.biennium, bill_num))

        page = self.get(url)
        page = lxml.etree.fromstring(page.content)
        page = xpath(page, "//wa:Legislation")[0]

        title = xpath(page, "string(wa:LongDescription)")

        bill_type = xpath(
            page,
            "string(wa:ShortLegislationType/wa:LongLegislationType)")
        bill_type = bill_type.lower()

        if bill_type == 'gubernatorial appointment':
            return

        bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                    title=title, classification=[bill_type])
        fake_source = ("http://apps.leg.wa.gov/billinfo/"
                       "summary.aspx?bill=%s&year=%s" % (
                           bill_num, session[0:4]))

        bill.add_source(fake_source)

        try:
            for version in self.versions[bill_id]:
                bill.add_version_link(note=version['note'],
                                      url=version['url'],
                                      media_type=version['media_type'])
        except KeyError:
            self.warning("No versions were found for {}".format(bill_id))

        try:
            for document in self.documents[bill_num]:
                bill.add_document_link(note=document['note'],
                                       url=document['url'],
                                       media_type=document['media_type'])
        except KeyError:
            pass

        self.scrape_sponsors(bill)
        self.scrape_actions(bill, bill_num)
        self.scrape_hearings(bill, bill_num)
        yield from self.scrape_votes(bill)
        bill.subject = list(set(self._subjects[bill_id]))
        yield bill
Beispiel #11
0
    def scrape(self):
        self.session = '2011'

        for i, page in enumerate(self.searchLegislation()):
            for legislation_summary in self.parseSearchResults(page):
                title = legislation_summary['Title'].strip()
                if title == "":
                    continue

                bill = Bill(name=legislation_summary['Record #'],
                            session=self.session,
                            title=title,
                            type=[legislation_summary['Type'].lower()],
                            organization=self.jurisdiction.name)

                bill.add_source(legislation_summary['URL'])

                legislation_details = self.expandLegislationSummary(
                    legislation_summary)

                for related_bill in legislation_details.get(
                        'Related files', []):
                    bill.add_related_bill(name=related_bill,
                                          session=self.session,
                                          relation='other-session',
                                          chamber=None)

                for i, sponsor in enumerate(
                        legislation_details.get('Sponsors', [])):
                    if i == 0:
                        primary = True
                        sponsorship_type = "Primary"
                    else:
                        primary = False
                        sponsorship_type = "Regular"

                    bill.add_sponsor(sponsor, sponsorship_type, 'person',
                                     primary)

                for subject in legislation_details.get(u'Topics', []):
                    bill.add_subject(subject)

                for attachment in legislation_details.get(u'Attachments', []):
                    bill.add_version_link('PDF',
                                          attachment['url'],
                                          mimetype="application/pdf")

                yield bill
Beispiel #12
0
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
        page = self.get(url).text
        page = unicode_csv_reader(StringIO(page), delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial',
                'CMR': 'concurrent memorial'}[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(bill_id, legislative_session=session,
                        chamber=chamber, title=row[3], classification=bill_type)
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(primary, classification='primary',
                                     entity_type='person', primary=True)
            # ftp://www.arkleg.state.ar.us/Bills/
            # TODO: Keep on eye on this post 2017 to see if they apply R going forward.
            session_code = '2017R' if session == '2017' else session

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/%s.pdf" % (
                               session_code, bill_id.replace(' ', '')))
            bill.add_version_link(bill_id, version_url, media_type='application/pdf')

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
    def scrape(self):
        self.session = '2011'

        for i, page in enumerate(self.searchLegislation()) :
            for legislation_summary in self.parseSearchResults(page) :
                title = legislation_summary['Title'].strip()
                if title == "":
                    continue

                bill = Bill(name=legislation_summary['Record #'],
                            session=self.session,
                            title=title,
                            type=[legislation_summary['Type'].lower()],
                            organization=self.jurisdiction.name)

                bill.add_source(legislation_summary['URL'])

                legislation_details = self.expandLegislationSummary(legislation_summary)

                for related_bill in legislation_details.get('Related files', []) :
                    bill.add_related_bill(name = related_bill,
                                          session = self.session,
                                          relation='other-session',
                                          chamber=None)

                for i, sponsor in enumerate(legislation_details.get('Sponsors', [])) :
                    if i == 0 :
                        primary = True
                        sponsorship_type = "Primary"
                    else :
                        primary = False
                        sponsorship_type = "Regular"

                    bill.add_sponsor(sponsor, sponsorship_type,
                                     'person', primary)

                for subject in legislation_details.get(u'Topics', []) :
                    bill.add_subject(subject)

                for attachment in legislation_details.get(u'Attachments', []) :
                    bill.add_version_link('PDF',
                                          attachment['url'],
                                          mimetype="application/pdf")


                yield bill
Beispiel #14
0
    def scrape_chamber(self, chamber, session):
        chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

        url = ("http://legisweb.state.wy.us/%s/billreference/"
               "BillReference.aspx?type=%s" % (session, chamber_abbrev))
        page = self.lxmlize(url)

        for tr in page.xpath(
                "//table[contains(@id,'cphContent_gvBills')]//tr")[1:]:
            bill_id = tr.xpath("string(td[1])").strip()
            title = tr.xpath("string(td[2])").strip()

            if bill_id[0:2] in ['SJ', 'HJ']:
                bill_type = 'joint resolution'
            else:
                bill_type = 'bill'

            bill = Bill(bill_id,
                        legislative_session=session,
                        title=title,
                        chamber=chamber,
                        classification=bill_type)

            yield from self.scrape_digest(bill, chamber)

            # versions
            for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') +
                      tr.xpath('td[12]//a')):
                # skip references to other bills
                if a.text.startswith('See'):
                    continue
                bill.add_version_link(a.text,
                                      a.get('href'),
                                      media_type='application/pdf')

            # documents
            fnote = tr.xpath('td[9]//a')
            if fnote:
                bill.add_document_link('Fiscal Note', fnote[0].get('href'))
            summary = tr.xpath('td[14]//a')
            if summary:
                bill.add_document_link('Summary', summary[0].get('href'))

            bill.add_source(url)
            yield bill
Beispiel #15
0
    def scrape_bill(self, chamber, session):
        url = "ftp://www.arkleg.state.ar.us/SessionInformation/LegislativeMeasures.txt"
        page = csv.reader(get_utf_16_ftp_content(url).splitlines(), delimiter='|')

        for row in page:
            bill_chamber = {'H': 'lower', 'S': 'upper'}[row[0]]

            if bill_chamber != chamber:
                continue
            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            type_spec = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = {
                'B': 'bill',
                'R': 'resolution',
                'JR': 'joint resolution',
                'CR': 'concurrent resolution',
                'MR': 'memorial',
                'CMR': 'concurrent memorial'}[type_spec]

            if row[-1] != self.slug:
                continue

            bill = Bill(bill_id, legislative_session=session,
                        chamber=chamber, title=row[3], classification=bill_type)
            bill.add_source(url)

            primary = row[11]
            if not primary:
                primary = row[12]

            if primary:
                bill.add_sponsorship(primary, classification='primary',
                                     entity_type='person', primary=True)

            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/Searchable/%s.pdf" % (
                               self.slug, bill_id.replace(' ', '')))
            bill.add_version_link(bill_id, version_url, media_type='application/pdf')

            yield from self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Beispiel #16
0
    def scrape_chamber(self, chamber, session):
        chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

        url = ("http://legisweb.state.wy.us/%s/billreference/"
               "BillReference.aspx?type=%s" % (session, chamber_abbrev))
        page = self.lxmlize(url)

        for tr in page.xpath("//table[contains(@id,'cphContent_gvBills')]//tr")[1:]:
            bill_id = tr.xpath("string(td[1])").strip()
            title = tr.xpath("string(td[2])").strip()

            if bill_id[0:2] in ['SJ', 'HJ']:
                bill_type = 'joint resolution'
            else:
                bill_type = 'bill'

            bill = Bill(bill_id, legislative_session=session, title=title, chamber=chamber,
                        classification=bill_type)

            yield from self.scrape_digest(bill, chamber)

            # versions
            for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') +
                      tr.xpath('td[12]//a')):
                # skip references to other bills
                if a.text.startswith('See'):
                    continue
                bill.add_version_link(a.text, a.get('href'),
                                      media_type='application/pdf')

            # documents
            fnote = tr.xpath('td[9]//a')
            if fnote:
                bill.add_document_link('Fiscal Note', fnote[0].get('href'))
            summary = tr.xpath('td[14]//a')
            if summary:
                bill.add_document_link('Summary', summary[0].get('href'))

            bill.add_source(url)
            yield bill
Beispiel #17
0
    def bill_info(self, bill_link, session, main_url):
        bill_page = self.lxmlize(bill_link)

        long_title = self.get_node(
            bill_page,
            '//div[@class="main-content"]/div[1]/div/h2').text.split()

        bill_number = long_title[0]
        title = ''
        for x in range(2, len(long_title)):
            title += long_title[x] + ' '
        title = title[0:-1]

        if not title:
            self.error('no title, skipping %s', bill_number)
            return

        bill_type = 'resolution' if 'LR' in bill_number else 'bill'

        bill = Bill(bill_number, session, title, classification=bill_type)

        bill.add_source(main_url)
        bill.add_source(bill_link)

        introduced_by = self.get_node(
            bill_page,
            '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/a[1]/text()')

        if not introduced_by:
            introduced_by = self.get_node(
                bill_page,
                '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/text()')
            introduced_by = introduced_by.split('Introduced By:')[1].strip()

        bill.add_sponsorship(
            name=introduced_by,
            entity_type='person',
            primary=True,
            classification='primary',
        )

        action_nodes = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]//table/tbody/tr')

        for action_node in action_nodes:
            date = self.get_node(
                action_node,
                './td[1]').text
            date = datetime.strptime(date, '%b %d, %Y')

            # The action node may have an anchor element within it, so
            # we grab all the text within.
            action = self.get_node(
                action_node,
                './td[2]').text_content()

            if 'Governor' in action:
                actor = 'executive'
            elif 'Speaker' in action:
                actor = 'legislature'
            else:
                actor = 'legislature'

            action_type = self.action_types(action)
            bill.add_action(
                action,
                date.strftime('%Y-%m-%d'),
                chamber=actor,
                classification=action_type,
            )

        # Were in reverse chronological order.
        bill.actions.reverse()

        # Grabs bill version documents.
        version_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[3]/div[2]/'
            'div[@class="hidden-xs"]/ul[1]/li/a')

        for version_link in version_links:
            version_name = version_link.text
            version_url = version_link.attrib['href']
            # replace Current w/ session number
            version_url = version_url.replace('Current', session)
            bill.add_version_link(version_name, version_url, media_type='application/pdf')

        # Adds any documents related to amendments.
        amendment_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]/div[2]/table/tr/td[1]/a')

        for amendment_link in amendment_links:
            amendment_name = amendment_link.text
            amendment_url = amendment_link.attrib['href']
            bill.add_document_link(amendment_name, amendment_url)

        # Related transcripts.
        transcript_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]/div[2]/'
            'div[@class="hidden-xs"]/table/tr/td/a')

        for transcript_link in transcript_links:
            transcript_name = transcript_link.text
            transcript_url = transcript_link.attrib['href']
            bill.add_document_link(transcript_name, transcript_url)

        yield bill

        yield from self.scrape_votes(bill, bill_page, actor)
Beispiel #18
0
    def scrape_bill(self, chamber, session, bill_id, title, url):
        page = self.lxmlize(url)

        if re.match(r'^(S|H)B ', bill_id):
            btype = ['bill']
        elif re.match(r'(S|H)C ', bill_id):
            btype = ['commemoration']
        elif re.match(r'(S|H)JR ', bill_id):
            btype = ['joint resolution']
        elif re.match(r'(S|H)CR ', bill_id):
            btype = ['concurrent resolution']
        else:
            btype = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=btype)
        bill.add_source(url)

        regex_ns = "http://exslt.org/regular-expressions"
        version_links = page.xpath(
            r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
            namespaces={'re': regex_ns})
        for link in version_links:
            bill.add_version_link(link.xpath('string()').strip(),
                                  link.attrib['href'],
                                  media_type='text/html',
                                  on_duplicate='ignore')

        sponsor_links = page.xpath(
            '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]' +
            '/label[contains(text(), "Sponsors:")]' +
            '/following-sibling::div[1]/p/a')
        for link in sponsor_links:
            if link.attrib['href'].startswith(
                    'https://sdlegislature.gov/Legislators/'):
                sponsor_type = 'person'
            elif link.attrib['href'].startswith(
                    'https://sdlegislature.gov/Legislative_Session/Committees'
            ):
                sponsor_type = 'organization'
            else:
                raise ScrapeError('Found unexpected sponsor, URL: ' +
                                  link.attrib['href'])
            bill.add_sponsorship(link.text,
                                 classification='primary',
                                 primary=True,
                                 entity_type=sponsor_type)

        actor = chamber
        use_row = False

        for row in page.xpath("//table[contains(@id, 'tblBillActions')]//tr"):
            # Some tables have null rows, that are just `<tr></tr>`
            # Eg: sdlegislature.gov/Legislative_Session/Bills/Bill.aspx?Bill=1005&Session=2018
            if row.text_content() == '':
                self.debug(
                    'Skipping action table row that is completely empty')
                continue

            if 'Date' in row.text_content() and 'Action' in row.text_content():
                use_row = True
                continue
            elif not use_row:
                continue

            action = row.xpath("string(td[2])").strip()

            atypes = []
            if action.startswith('First read'):
                atypes.append('introduction')
                atypes.append('reading-1')

            if re.match(r'Signed by (?:the\s)*Governor', action,
                        re.IGNORECASE):
                atypes.append('executive-signature')
                actor = 'executive'

            match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)',
                             action)
            if match:
                if match.group(1) in ['Senate', 'House of Representatives']:
                    first = ''
                else:
                    first = 'committee-'
                if match.group(3).lower() == 'passed':
                    second = 'passage'
                elif match.group(3).lower() == 'failed':
                    second = 'failure'
                atypes.append("%s%s" % (first, second))

            if 'referred to' in action.lower():
                atypes.append('referral-committee')

            if 'Motion to amend, Passed Amendment' in action:
                atypes.append('amendment-introduction')
                atypes.append('amendment-passage')

            if 'Veto override, Passed' in action:
                atypes.append('veto-override-passage')
            elif 'Veto override, Failed' in action:
                atypes.append('veto-override-failure')

            if 'Delivered to the Governor' in action:
                atypes.append('executive-receipt')

            match = re.match("First read in (Senate|House)", action)
            if match:
                if match.group(1) == 'Senate':
                    actor = 'upper'
                else:
                    actor = 'lower'

            date = row.xpath("string(td[1])").strip()
            match = re.match(r'\d{2}/\d{2}/\d{4}', date)
            if not match:
                self.warning("Bad date: %s" % date)
                continue
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
                yield from self.scrape_vote(bill, date, link.attrib['href'])

            bill.add_action(action, date, chamber=actor, classification=atypes)

        for link in page.xpath("//a[contains(@href, 'Keyword')]"):
            bill.add_subject(link.text.strip())

        yield bill
Beispiel #19
0
    def bill_info(self, bill_link, session, main_url):
        bill_page = self.lxmlize(bill_link)

        long_title = self.get_node(
            bill_page,
            '//div[@class="main-content"]/div[1]/div/h2').text.split()

        bill_number = long_title[0]
        title = ''
        for x in range(2, len(long_title)):
            title += long_title[x] + ' '
        title = title[0:-1]

        if not title:
            self.error('no title, skipping %s', bill_number)
            return

        bill_type = 'resolution' if 'LR' in bill_number else 'bill'

        bill = Bill(bill_number, session, title, classification=bill_type)

        bill.add_source(main_url)
        bill.add_source(bill_link)

        introduced_by = self.get_node(
            bill_page,
            '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/a[1]/text()')

        if not introduced_by:
            introduced_by = self.get_node(
                bill_page,
                '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/text()')
            introduced_by = introduced_by.split('Introduced By:')[1].strip()

        bill.add_sponsorship(
            name=introduced_by,
            entity_type='person',
            primary=True,
            classification='primary',
        )

        action_nodes = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]//table/tbody/tr')

        for action_node in action_nodes:
            date = self.get_node(
                action_node,
                './td[1]').text
            date = datetime.strptime(date, '%b %d, %Y')

            # The action node may have an anchor element within it, so
            # we grab all the text within.
            action = self.get_node(
                action_node,
                './td[2]').text_content()

            if 'Governor' in action:
                actor = 'executive'
            elif 'Speaker' in action:
                actor = 'legislature'
            else:
                actor = 'legislature'

            action_type = self.action_types(action)
            bill.add_action(
                action,
                date.strftime('%Y-%m-%d'),
                chamber=actor,
                classification=action_type,
            )

        # Were in reverse chronological order.
        bill.actions.reverse()

        # Grabs bill version documents.
        version_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[3]/div[2]/'
            'div[@class="hidden-xs"]/ul[1]/li/a')

        for version_link in version_links:
            version_name = version_link.text
            version_url = version_link.attrib['href']
            # replace Current w/ session number
            version_url = version_url.replace('Current', session)
            bill.add_version_link(version_name, version_url, media_type='application/pdf')

        # Adds any documents related to amendments.
        amendment_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]/div[2]/table/tr/td[1]/a')

        for amendment_link in amendment_links:
            amendment_name = amendment_link.text
            amendment_url = amendment_link.attrib['href']
            bill.add_document_link(amendment_name, amendment_url)

        # Related transcripts.
        transcript_links = self.get_nodes(
            bill_page,
            '//div[@class="main-content"]/div[5]/div[2]/'
            'div[@class="hidden-xs"]/table/tr/td/a')

        for transcript_link in transcript_links:
            transcript_name = transcript_link.text
            transcript_url = transcript_link.attrib['href']
            bill.add_document_link(transcript_name, transcript_url)

        return bill
    def scrape_details(self, bill_detail_url, session, chamber, bill_id):
        """
        Create the Bill and add the information obtained from the provided bill_detail_url.
        and then yield the bill object.
        :param bill_detail_url:
        :param session:
        :param chamber:
        :param bill_id:
        :return:
        """
        page = self.get(bill_detail_url).text

        if 'INVALID BILL NUMBER' in page:
            self.warning('INVALID BILL %s' % bill_detail_url)
            return

        doc = lxml.html.fromstring(page)
        doc.make_links_absolute(bill_detail_url)

        bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]

        bill_type = bill_div.xpath('span/text()')[0]

        if 'General Bill' in bill_type:
            bill_type = 'bill'
        elif 'Concurrent Resolution' in bill_type:
            bill_type = 'concurrent resolution'
        elif 'Joint Resolution' in bill_type:
            bill_type = 'joint resolution'
        elif 'Resolution' in bill_type:
            bill_type = 'resolution'
        else:
            raise ValueError('unknown bill type: %s' % bill_type)

        # this is fragile, but less fragile than it was
        b = bill_div.xpath('./b[text()="Summary:"]')[0]
        bill_summary = b.getnext().tail.strip()

        bill = Bill(
            bill_id,
            legislative_session=
            session,  # session name metadata's `legislative_sessions`
            chamber=chamber,  # 'upper' or 'lower'
            title=bill_summary,
            classification=bill_type)

        subjects = list(self._subjects[bill_id])

        for subject in subjects:
            bill.add_subject(subject)

        # sponsors
        for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
            bill.add_sponsorship(name=sponsor,
                                 classification='primary',
                                 primary=True,
                                 entity_type='person')
        for sponsor in doc.xpath(
                '//a[contains(@href, "committee.php")]/text()'):
            sponsor = sponsor.replace(u'\xa0', ' ').strip()
            bill.add_sponsorship(name=sponsor,
                                 classification='primary',
                                 primary=True,
                                 entity_type='organization')

        # find versions
        version_url = doc.xpath('//a[text()="View full text"]/@href')[0]
        version_html = self.get(version_url).text
        version_doc = lxml.html.fromstring(version_html)
        version_doc.make_links_absolute(version_url)
        for version in version_doc.xpath('//a[contains(@href, "/prever/")]'):
            # duplicate versions with same date, use first appearance

            bill.add_version_link(
                note=version.
                text,  # Description of the version from the state;
                #  eg, 'As introduced', 'Amended', etc.
                url=version.get('href'),
                on_duplicate='ignore',
                media_type='text/html'  # Still a MIME type
            )

        # actions
        for row in bill_div.xpath('table/tr'):
            date_td, chamber_td, action_td = row.xpath('td')

            date = datetime.datetime.strptime(date_td.text, "%m/%d/%y")
            action_chamber = {
                'Senate': 'upper',
                'House': 'lower',
                None: 'legislature'
            }[chamber_td.text]

            action = action_td.text_content()
            action = action.split('(House Journal')[0]
            action = action.split('(Senate Journal')[0].strip()

            atype = action_type(action)

            bill.add_action(
                description=action,  # Action description, from the state
                date=date.strftime('%Y-%m-%d'),  # `YYYY-MM-DD` format
                chamber=action_chamber,  # 'upper' or 'lower'
                classification=atype  # Options explained in the next section
            )

        # votes
        vurl = doc.xpath('//a[text()="View Vote History"]/@href')
        if vurl:
            vurl = vurl[0]
            yield from self.scrape_vote_history(bill, vurl)

        bill.add_source(bill_detail_url)
        yield bill
Beispiel #21
0
    def scrape(self, session=None, chambers=None):
        # Bills endpoint can sometimes take a very long time to load
        self.timeout = 300

        if not session:
            session = self.latest_session()
            self.info('no session, using %s', session)

        if int(session) < 128:
            raise AssertionError("No data for period {}".format(session))

        elif int(session) < 131:
            # they changed their data format starting in 131st and added
            # an undocumented API
            yield from self.old_scrape(session)

        else:
            chamber_dict = {"Senate": "upper", "House": "lower",
                            "House of Representatives": "lower",
                            "house": "lower", "senate": "upper"}

            # so presumanbly not everything passes, but we haven't
            # seen anything not pass yet, so we'll need to wait
            # till it fails and get the right language in here
            vote_results = {"approved": True,
                            "passed": True,
                            "adopted": True,
                            "true": True,
                            "false": False,
                            "failed": False,
                            True: True,
                            False: False}

            action_dict = {"ref_ctte_100": "referral-committee",
                           "intro_100": "introduction",
                           "intro_101": "introduction",
                           "pass_300": "passage",
                           "intro_110": "reading-1",
                           "refer_210": "referral-committee",
                           "crpt_301": None,
                           "crpt_317": None,
                           "concur_606": "passage",
                           "pass_301": "passage",
                           "refer_220": "referral-committee",
                           "intro_102": ["introduction", "passage"],
                           "intro_105": ["introduction", "passage"],
                           "intro_ref_ctte_100": "referral-committee",
                           "refer_209": None,
                           "intro_108": ["introduction", "passage"],
                           "intro_103": ["introduction", "passage"],
                           "msg_reso_503": "passage",
                           "intro_107": ["introduction", "passage"],
                           "imm_consid_360": "passage",
                           "refer_213": None,
                           "adopt_reso_100": "passage",
                           "adopt_reso_110": "passage",
                           "msg_507": "amendment-passage",
                           "confer_713": None,
                           "concur_603": None,
                           "confer_712": None,
                           "msg_506": "amendment-failure",
                           "receive_message_100": "passage",
                           "motion_920": None,
                           "concur_611": None,
                           "confer_735": None,
                           "third_429": None,
                           "final_501": None,
                           "concur_608": None,
                           }

            base_url = "http://search-prod.lis.state.oh.us"
            first_page = base_url
            first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session)
            legislators = self.get_legislator_ids(first_page)
            all_amendments = self.get_other_data_source(first_page, base_url, "amendments")
            all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals")
            all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss")
            all_analysis = self.get_other_data_source(first_page, base_url, "analysiss")

            for row in self.get_bill_rows(session):
                spacer, number_link, _ga, title, primary_sponsor, status, spacer = row.xpath('td')

                # S.R.No.1 -> SR1
                bill_id = number_link.text_content().replace('No.', '')
                bill_id = bill_id.replace('.', '').replace(' ', '')
                # put one space back in between type and number
                bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

                title = title.text_content().strip()
                title = re.sub(r'^Title', '', title)

                chamber = 'lower' if 'H' in bill_id else 'upper'
                classification = 'bill' if 'B' in bill_id else 'resolution'

                bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                            title=title, classification=classification)
                bill.add_source(number_link.xpath('a/@href')[0])

                # get bill from API
                bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/'
                                'general_assembly_{}/{}/{}/'.format(
                                    session,
                                    'bills' if 'B' in bill_id else 'resolutions',
                                    bill_id.lower().replace(' ', '')
                                ))
                data = self.get(bill_api_url).json()

                # add title if no short title
                if not bill.title:
                    bill.title = data['items'][0]['longtitle']
                bill.add_title(data['items'][0]['longtitle'], 'long title')

                # this stuff is version-specific
                for version in data['items']:
                    version_name = version["version"]
                    version_link = base_url+version["pdfDownloadLink"]
                    bill.add_version_link(version_name, version_link, media_type='application/pdf')

                # we'll use latest bill_version for everything else
                bill_version = data['items'][0]
                bill.add_source(bill_api_url)

                # subjects
                for subj in bill_version["subjectindexes"]:
                    try:
                        bill.add_subject(subj["primary"])
                    except KeyError:
                        pass
                    try:
                        secondary_subj = subj["secondary"]
                    except KeyError:
                        secondary_subj = ""
                    if secondary_subj:
                        bill.add_subject(secondary_subj)

                # sponsors
                sponsors = bill_version["sponsors"]
                for sponsor in sponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                                        sponsor_name,
                                        classification='primary',
                                        entity_type='person',
                                        primary=True
                        )

                cosponsors = bill_version["cosponsors"]
                for sponsor in cosponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                                         sponsor_name,
                                         classification='cosponsor',
                                         entity_type='person',
                                         primary=False,
                        )

                try:
                    action_doc = self.get(base_url+bill_version["action"][0]["link"])
                except scrapelib.HTTPError:
                    pass
                else:

                    actions = action_doc.json()
                    for action in reversed(actions["items"]):
                        actor = chamber_dict[action["chamber"]]
                        action_desc = action["description"]
                        try:
                            action_type = action_dict[action["actioncode"]]
                        except KeyError:
                            self.warning("Unknown action {desc} with code {code}."
                                         " Add it to the action_dict"
                                         ".".format(desc=action_desc,
                                                    code=action["actioncode"]))
                            action_type = None

                        date = self._tz.localize(datetime.datetime.strptime(
                                                 action["datetime"],
                                                 "%Y-%m-%dT%H:%M:%S"))
                        date = "{:%Y-%m-%d}".format(date)

                        bill.add_action(action_desc,
                                        date, chamber=actor,
                                        classification=action_type)

                # attach documents gathered earlier
                self.add_document(all_amendments, bill_id, "amendment", bill, base_url)
                self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url)
                self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url)
                self.add_document(all_analysis, bill_id, "analysis", bill, base_url)

                # votes
                vote_url = base_url+bill_version["votes"][0]["link"]
                vote_doc = self.get(vote_url)
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url,
                                             base_url, bill, legislators,
                                             chamber_dict, vote_results)

                vote_url = base_url
                vote_url += bill_version["cmtevotes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    self.warning("Vote page not "
                                 "loading; skipping: {}".format(vote_url))
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url,
                                             base_url, bill, legislators,
                                             chamber_dict, vote_results)

                if data["items"][0]["effective_date"]:
                    effective_date = datetime.datetime.strptime(data["items"][0]["effective_date"],
                                                                "%Y-%m-%d")
                    effective_date = self._tz.localize(effective_date)
                    # the OH website adds an action that isn't in the action list JSON.
                    # It looks like:
                    # Effective 7/6/18
                    effective_date_oh = "{:%-m/%-d/%y}".format(effective_date)
                    effective_action = "Effective {}".format(effective_date_oh)
                    bill.add_action(effective_action,
                                    effective_date,
                                    chamber="executive",
                                    classification=["became-law"])

                # we have never seen a veto or a disapprove, but they seem important.
                # so we'll check and throw an error if we find one
                # life is fragile. so are our scrapers.
                if "veto" in bill_version:
                    veto_url = base_url+bill_version["veto"][0]["link"]
                    veto_json = self.get(veto_url).json()
                    if len(veto_json["items"]) > 0:
                        raise AssertionError("Whoa, a veto! We've never"
                                             " gotten one before."
                                             " Go write some code to deal"
                                             " with it: {}".format(veto_url))

                if "disapprove" in bill_version:
                    disapprove_url = base_url+bill_version["disapprove"][0]["link"]
                    disapprove_json = self.get(disapprove_url).json()
                    if len(disapprove_json["items"]) > 0:
                        raise AssertionError("Whoa, a disapprove! We've never"
                                             " gotten one before."
                                             " Go write some code to deal "
                                             "with it: {}".format(disapprove_url))

                yield bill
Beispiel #22
0
    def scrape_bill(self, session, bill_url):
        page = self.get(bill_url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_url)

        try:
            bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
        except IndexError:
            self.logger.warning("Something is wrong with bill page, skipping.")
            return
        secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

        # checking if there is a matching bill
        if secondary_bill_id:
            secondary_bill_id = secondary_bill_id[0].text
            # swap ids if * is in secondary_bill_id
            if "*" in secondary_bill_id:
                bill_id, secondary_bill_id = secondary_bill_id, bill_id
                secondary_bill_id = secondary_bill_id.strip()
            secondary_bill_id = secondary_bill_id.replace("  ", " ")

        bill_id = bill_id.replace("*", "").replace("  ", " ").strip()

        if "B" in bill_id:
            bill_type = "bill"
        elif "JR" in bill_id:
            bill_type = "joint resolution"
        elif "R" in bill_id:
            bill_type = "resolution"

        primary_chamber = "lower" if "H" in bill_id else "upper"
        # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

        title = page.xpath("//span[@id='lblAbstract']")[0].text
        if title is None:
            msg = "%s detail page was missing title info."
            self.logger.warning(msg % bill_id)
            return

        # bill subject
        subject_pos = title.find("-")
        subjects = [s.strip() for s in title[: subject_pos - 1].split(",")]
        subjects = filter(None, subjects)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=primary_chamber,
            title=title,
            classification=bill_type,
        )
        for subject in subjects:
            bill.add_subject(subject)

        if secondary_bill_id:
            bill.add_identifier(secondary_bill_id)

        if page.xpath('//span[@id="lblCompNumber"]/a'):
            companion_id = page.xpath('//span[@id="lblCompNumber"]/a')[0].text_content().strip()
            bill.add_related_bill(
                identifier=companion_id,
                legislative_session=session,
                relation_type="companion",
            )

        bill.add_source(bill_url)

        # Primary Sponsor
        sponsor = (
            page.xpath("//span[@id='lblBillPrimeSponsor']")[0]
            .text_content()
            .split("by")[-1]
        )
        sponsor = sponsor.replace("*", "").strip()
        if sponsor:
            bill.add_sponsorship(
                sponsor, classification="primary", entity_type="person", primary=True
            )

        # bill text
        btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
        bill.add_version_link(
            "Current Version", btext.get("href"), media_type="application/pdf"
        )

        # documents
        summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
        if summary:
            bill.add_document_link("Summary", summary[0].get("href"))
        fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
        if fiscal:
            bill.add_document_link("Fiscal Note", fiscal[0].get("href"))
        amendments = page.xpath('//a[contains(@href, "/Amend/")]')
        for amendment in amendments:
            bill.add_document_link("Amendment " + amendment.text, amendment.get("href"))
        # amendment notes in image with alt text describing doc inside <a>
        amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
        for afn in amend_fns:
            bill.add_document_link(
                afn.get("alt"), afn.getparent().get("href"), on_duplicate="ignore"
            )

        # actions
        atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
        actions_from_table(bill, atable)

        # if there is a matching bill
        if secondary_bill_id:
            # secondary sponsor
            secondary_sponsor = (
                page.xpath("//span[@id='lblCompPrimeSponsor']")[0]
                .text_content()
                .split("by")[-1]
            )
            secondary_sponsor = (
                secondary_sponsor.replace("*", "").replace(")", "").strip()
            )
            # Skip black-name sponsors.
            if secondary_sponsor:
                bill.add_sponsorship(
                    secondary_sponsor,
                    classification="primary",
                    entity_type="person",
                    primary=True,
                )

            # secondary actions
            cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
            actions_from_table(bill, cotable)

        # votes
        yield from self.scrape_vote_events(bill, page, bill_url)

        bill.actions.sort(key=lambda a: a["date"])
        yield bill
Beispiel #23
0
    def scrape_bill_list(self, url):
        bill_list = self._get_bill_list(url)

        for bill_info in bill_list:

            (bill_id, ) = bill_info.xpath('td[1]/font/input/@value')
            (sponsor, ) = bill_info.xpath('td[2]/font/input/@value')
            (subject, ) = bill_info.xpath('td[3]//text()')
            subject = subject.strip()
            chamber = self.CHAMBERS[bill_id[0]]

            if 'B' in bill_id:
                bill_type = 'bill'
            elif 'JR' in bill_id:
                bill_type = 'joint resolution'
            elif 'R' in bill_id:
                bill_type = 'resolution'
            else:
                raise AssertionError(
                    "Unknown bill type for bill '{}'".format(bill_id))

            bill = Bill(
                bill_id,
                legislative_session=self.session,
                chamber=chamber,
                title='',
                classification=bill_type,
            )
            if subject:
                bill.subject = [subject]
            if sponsor:
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )
            bill.add_source(url)

            bill_url = ('http://alisondb.legislature.state.al.us/Alison/'
                        'SESSBillStatusResult.aspx?BILL={}'.format(bill_id))
            bill.add_source(bill_url)

            bill_html = self._get_bill_response(bill_url)
            if bill_html is None:
                self.warning("Bill {} has no webpage, and will be skipped".
                             format(bill_id))
                continue
            bill_doc = lxml.html.fromstring(bill_html)

            if (bill_doc.xpath('//span[@id="ContentPlaceHolder1_lblShotTitle"]')):
                title = bill_doc.xpath(
                    '//span[@id="ContentPlaceHolder1_lblShotTitle"]'
                )[0].text_content().strip()
            if not title:
                title = "[No title given by state]"
            bill.title = title

            version_url_base = (
                'http://alisondb.legislature.state.al.us/ALISON/'
                'SearchableInstruments/{0}/PrintFiles/{1}-'.
                format(self.session, bill_id))
            versions = bill_doc.xpath(
                '//table[@class="box_versions"]/tr/td[2]/font/text()')
            for version in versions:
                name = version
                if version == "Introduced":
                    version_url = version_url_base + 'int.pdf'
                elif version == "Engrossed":
                    version_url = version_url_base + 'eng.pdf'
                elif version == "Enrolled":
                    version_url = version_url_base + 'enr.pdf'
                else:
                    raise NotImplementedError(
                        "Unknown version type found: '{}'".format(name))

                bill.add_version_link(
                    name,
                    version_url,
                    media_type='application/pdf',
                    on_duplicate='ignore',
                )

            # Fiscal notes exist, but I can't figure out how to build their URL
            fiscal_notes = bill_doc.xpath(
                '//table[@class="box_fiscalnote"]')[1:]
            for fiscal_note in fiscal_notes:
                pass

            # Budget Isolation Resolutions are handled as extra actions/votes
            birs = bill_doc.xpath(
                '//div[@class="box_bir"]//table//table/tr')[1:]
            for bir in birs:
                bir_action = bir.xpath('td[1]')[0].text_content().strip()
                # Sometimes ALISON's database puts another bill's
                # actions into the BIR action list; ignore these
                if bill_id not in bir_action:
                    self.warning(
                        "BIR action found ({}) ".format(bir_action) +
                        "that doesn't match the bill ID ({})".format(bill_id))
                    continue

                bir_date = datetime.datetime.strptime(
                    bir.xpath('td[2]/font/text()')[0], self.DATE_FORMAT)
                bir_type = bir.xpath('td[1]/font/text()')[0].split(" ")[0]
                bir_chamber = self.CHAMBERS[bir_type[0]]
                bir_text = "{0}: {1}".format(
                    bir_type, bir.xpath('td[3]/font/text()')[0].strip())

                bill.add_action(
                    bir_text,
                    TIMEZONE.localize(bir_date),
                    chamber=bir_chamber,
                    classification='other',
                )

                try:
                    (bir_vote_id, ) = bir.xpath('td[4]/font/input/@value')
                except ValueError:
                    bir_vote_id = ''

                bir_vote_id = bir_vote_id.strip()
                if bir_vote_id.startswith("Roll "):
                    bir_vote_id = bir_vote_id.split(" ")[-1]

                    yield from self.scrape_vote(
                        bill=bill,
                        vote_chamber=bir_type[0],
                        bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                        vote_id=bir_vote_id,
                        vote_date=TIMEZONE.localize(bir_date),
                        action_text=bir_text
                    )

            actions = bill_doc.xpath('//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
            action_date = None
            for action in actions:
                # If actions occur on the same day, only one date will exist
                if (action.xpath('td[1]/font/text()')[0].
                        encode('ascii', 'ignore').strip()):
                    action_date = datetime.datetime.strptime(
                        action.xpath('td[1]/font/text()')[0], self.DATE_FORMAT)

                (action_chamber, ) = action.xpath('td[2]/font/text()')

                if action.xpath('td[3]/font/u/text()'):
                    (amendment, ) = action.xpath('td[3]/font/u/text()')
                else:
                    amendment = None

                (action_text, ) = action.xpath('td[4]/font/text()')

                action_type = _categorize_action(action_text)

                # check for occasional extra last row
                if not action_chamber.strip():
                    continue

                # The committee cell is just an abbreviation, so get its name
                actor = self.CHAMBERS[action_chamber]
                try:
                    action_committee = re.search(
                        r'.*? referred to the .*? committee on (.*?)$',
                        action_text).group(1).strip()
                except AttributeError:
                    action_committee = ''

                act = bill.add_action(
                    action_text,
                    TIMEZONE.localize(action_date),
                    chamber=actor,
                    classification=action_type,
                )
                if action_committee:
                    act.add_related_entity(action_committee, entity_type='organization')

                try:
                    vote_button = action.xpath('td[9]//text()')[0].strip()
                except IndexError:
                    vote_button = ''

                if vote_button.startswith("Roll "):
                    vote_id = vote_button.split(" ")[-1]

                    yield from self.scrape_vote(
                        bill=bill,
                        vote_chamber=action_chamber,
                        bill_id=bill_id,
                        vote_id=vote_id,
                        vote_date=TIMEZONE.localize(action_date),
                        action_text=action_text
                    )

                if amendment:
                    amend_url = (
                        'http://alisondb.legislature.state.al.us/ALISON/'
                        'SearchableInstruments/{0}/PrintFiles/{1}.pdf'.
                        format(self.session, amendment))

                    amend_name = 'Amd/Sub {}'.format(amendment)

                    bill.add_version_link(
                        amend_name,
                        amend_url,
                        media_type='application/pdf',
                        on_duplicate='ignore',
                    )

            yield bill
def test_full_bill():
    create_jurisdiction()
    sp = ScrapePerson('Adam Smith')
    org = ScrapeOrganization(name='House', classification='lower')
    com = ScrapeOrganization(name='Arbitrary Committee',
                             classification='committee',
                             parent_id=org._id)

    oldbill = ScrapeBill('HB 99',
                         '1899',
                         'Axe & Tack Tax Act',
                         classification='tax bill',
                         from_organization=org._id)

    bill = ScrapeBill('HB 1',
                      '1900',
                      'Axe & Tack Tax Act',
                      classification='tax bill',
                      from_organization=org._id)
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    act = bill.add_action('sent to arbitrary committee',
                          '1900-04-04',
                          chamber='lower')
    act.add_related_entity('arbitrary committee', 'organization', com._id)
    bill.add_related_bill("HB 99",
                          legislative_session="1899",
                          relation_type="prior-session")
    bill.add_sponsorship('Adam Smith',
                         classification='extra sponsor',
                         entity_type='person',
                         primary=False,
                         entity_id=sp._id)
    bill.add_sponsorship('Jane Smith',
                         classification='lead sponsor',
                         entity_type='person',
                         primary=True)
    bill.add_abstract('This is an act about axes and taxes and tacks.',
                      note="official",
                      date='1969-10-20')
    bill.add_document_link('Fiscal Note',
                           'http://example.com/fn.pdf',
                           media_type='application/pdf')
    bill.add_document_link('Fiscal Note',
                           'http://example.com/fn.html',
                           media_type='text/html')
    bill.add_version_link('Fiscal Note',
                          'http://example.com/v/1',
                          media_type='text/html')
    bill.add_source('http://example.com/source')

    # import bill
    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict(), com.as_dict()])

    pi = PersonImporter('jid')
    pi.import_data([sp.as_dict()])

    BillImporter('jid', oi,
                 pi).import_data([oldbill.as_dict(),
                                  bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier='HB 1')
    assert b.from_organization.classification == 'lower'
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ['taxes', 'axes']
    assert b.abstracts.get().note == 'official'
    assert b.abstracts.get().date == '1969-10-20'

    # other_title, other_identifier added
    assert b.other_titles.get().title == 'Tack & Axe Tax Act'
    assert b.other_identifiers.get().identifier == 'SB 9'

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == 'HB 99'

    # and bill got resolved
    assert rb.related_bill.identifier == 'HB 99'

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name='Adam Smith')
    for ss in sponsorships:
        if ss.primary:
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
Beispiel #25
0
    def scrape_bill(self, chamber, session, bill_id, title, url):
        page = self.lxmlize(url)

        if re.match(r'^(S|H)B ', bill_id):
            btype = ['bill']
        elif re.match(r'(S|H)C ', bill_id):
            btype = ['commemoration']
        elif re.match(r'(S|H)JR ', bill_id):
            btype = ['joint resolution']
        elif re.match(r'(S|H)CR ', bill_id):
            btype = ['concurrent resolution']
        else:
            btype = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=btype
                    )
        bill.add_source(url)

        regex_ns = "http://exslt.org/regular-expressions"
        version_links = page.xpath(
            r"//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
            namespaces={'re': regex_ns})
        for link in version_links:
            bill.add_version_link(
                link.xpath('string()').strip(),
                link.attrib['href'],
                media_type='text/html',
                on_duplicate='ignore'
            )

        sponsor_links = page.xpath(
            '//div[@id="ctl00_ContentPlaceHolder1_ctl00_BillDetail"]' +
            '/label[contains(text(), "Sponsors:")]' +
            '/following-sibling::div[1]/p/a'
        )
        for link in sponsor_links:
            if link.attrib['href'].startswith('https://sdlegislature.gov/Legislators/'):
                sponsor_type = 'person'
            elif link.attrib['href'].startswith(
                'https://sdlegislature.gov/Legislative_Session/Committees'
            ):
                sponsor_type = 'organization'
            else:
                raise ScrapeError(
                    'Found unexpected sponsor, URL: ' +
                    link.attrib['href']
                )
            bill.add_sponsorship(
                link.text,
                classification='primary',
                primary=True,
                entity_type=sponsor_type
            )

        actor = chamber
        use_row = False

        for row in page.xpath("//table[contains(@id, 'tblBillActions')]//tr"):
            # Some tables have null rows, that are just `<tr></tr>`
            # Eg: sdlegislature.gov/Legislative_Session/Bills/Bill.aspx?Bill=1005&Session=2018
            if row.text_content() == '':
                self.debug('Skipping action table row that is completely empty')
                continue

            if 'Date' in row.text_content() and 'Action' in row.text_content():
                use_row = True
                continue
            elif not use_row:
                continue

            action = row.xpath("string(td[2])").strip()

            atypes = []
            if action.startswith('First read'):
                atypes.append('introduction')
                atypes.append('reading-1')

            if re.match(r'Signed by (?:the\s)*Governor', action, re.IGNORECASE):
                atypes.append('executive-signature')
                actor = 'executive'

            match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)',
                             action)
            if match:
                if match.group(1) in ['Senate',
                                      'House of Representatives']:
                    first = ''
                else:
                    first = 'committee-'
                if match.group(3).lower() == 'passed':
                    second = 'passage'
                elif match.group(3).lower() == 'failed':
                    second = 'failure'
                atypes.append("%s%s" % (first, second))

            if 'referred to' in action.lower():
                atypes.append('referral-committee')

            if 'Motion to amend, Passed Amendment' in action:
                atypes.append('amendment-introduction')
                atypes.append('amendment-passage')
                amd = row.xpath('td[2]/a[contains(@href,"Amendment.aspx")]')[0]
                version_name = amd.xpath('string(.)')
                version_url = amd.xpath('@href')[0]
                if 'htm' in version_url:
                    mimetype = 'text/html'
                elif 'pdf' in version_url:
                    mimetype = 'application/pdf'
                bill.add_version_link(
                    version_name,
                    version_url,
                    media_type=mimetype,
                    on_duplicate='ignore'
                )

            if 'Veto override, Passed' in action:
                atypes.append('veto-override-passage')
            elif 'Veto override, Failed' in action:
                atypes.append('veto-override-failure')

            if 'Delivered to the Governor' in action:
                atypes.append('executive-receipt')

            match = re.match("First read in (Senate|House)", action)
            if match:
                if match.group(1) == 'Senate':
                    actor = 'upper'
                else:
                    actor = 'lower'

            date = row.xpath("string(td[1])").strip()
            match = re.match(r'\d{2}/\d{2}/\d{4}', date)
            if not match:
                self.warning("Bad date: %s" % date)
                continue
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
                yield from self.scrape_vote(bill, date, link.attrib['href'])

            bill.add_action(action, date, chamber=actor, classification=atypes)

        for link in page.xpath("//a[contains(@href, 'Keyword')]"):
            bill.add_subject(link.text.strip())

        yield bill
Beispiel #26
0
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.get(url).text)
        except scrapelib.HTTPError as e:
            self.warning('error (%s) fetching %s, skipping' % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

        if 'JR' in bill_id:
            bill_type = ['joint resolution']
        elif 'CR' in bill_id:
            bill_type = ['concurrent resolution']
        elif 'R' in bill_id:
            bill_type = ['resolution']
        else:
            bill_type = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.add_source(url)
        bill.subject = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if ':' in name:
                raise Exception(name)
            if 'otherAuth' in link.attrib['id']:
                bill.add_sponsorship(name, classification='cosponsor',
                                     entity_type='person', primary=False)
            else:
                bill.add_sponsorship(name, classification='primary',
                                     entity_type='person', primary=True)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == 'None':
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == 'H':
                actor = 'lower'
            elif actor == 'S':
                actor = 'upper'

            attrs = self.categorizer.categorize(action)
            related_entities = []
            for item in attrs['committees']:
                related_entities.append({
                    'type': 'committee',
                    'name': item
                })
            for item in attrs['legislators']:
                related_entities.append({
                    'type': 'legislator',
                    'name': item
                })
            bill.add_action(description=action,
                            date=date.strftime('%Y-%m-%d'),
                            chamber=actor,
                            classification=attrs['classification'],
                            related_entities=related_entities)

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        # Keep track of already seen versions to prevent processing duplicates.
        version_urls = []
        for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
            version_url = link.attrib['href']
            if version_url in version_urls:
                self.warning('Skipping duplicate version URL.')
                continue
            else:
                version_urls.append(version_url)
            name = link.text.strip()

            if re.search('COMMITTEE REPORTS|SCHEDULED CCR', version_url, re.IGNORECASE):
                bill.add_document_link(note=name, url=version_url,
                                       media_type='application/pdf')
                continue

            bill.add_version_link(note=name, url=version_url,
                                  media_type='application/pdf')

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            if 'HT_' not in link.attrib['href']:
                yield from self.scrape_votes(bill, self.urlescape(link.attrib['href']))

        # # If the bill has no actions and no versions, it's a bogus bill on
        # # their website, which appears to happen occasionally. Skip.
        has_no_title = (bill.title == "Short Title Not Found.")
        if has_no_title:
            # If there's no title, this is an empty page. Skip!
            return

        else:
            # Otherwise, save the bills.
            yield bill
Beispiel #27
0
    def scrape_senate_bills(self, chamber, insert, session, year):
        doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution',
                    8: 'joint resolution'}

        for docnum, bill_type in doc_type.items():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/' \
                             'HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            links = self.scrape_links(parentpage_url)
            count = 0
            for link in links:
                count += 1
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)

                page = self.get(page_path).text
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)

                bill_id = root.xpath('string(/html/body/div[@id="content"]' +
                                     '/table[1]/tr[1]/td[1]/font)')
                title = self.get_node(
                    root,
                    '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                    'b[contains(text(), "By:")]]/td/em/text()')

                bill = Bill(bill_id,
                            legislative_session=session,
                            chamber=chamber,
                            title=title,
                            classification=bill_type
                            )
                bill.subject = list(set(self.subject_mapping[bill_id]))

                for table in root.xpath('//div[@id="content"]/table'):
                    if 'Bill Text' in table.text_content():
                        bill_text = table.xpath("string(tr/td[2]/a/@href)")
                        text_url = "http://www.leg.state.nv.us" + bill_text
                        bill.add_version_link(note="Bill Text",
                                              url=text_url,
                                              media_type='application/pdf')

                primary, secondary = self.scrape_sponsors(page)

                for leg in primary:
                    bill.add_sponsorship(name=leg,
                                         classification='primary',
                                         entity_type='person',
                                         primary=True)
                for leg in secondary:
                    bill.add_sponsorship(name=leg,
                                         classification='cosponsor',
                                         entity_type='person',
                                         primary=False)

                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Agenda"
                    # bill.add_document(minutes_date, minutes_url)
                    bill.add_document_link(note=minutes_date,
                                           url=minutes_url)
                    minutes_count = minutes_count + 1

                self.scrape_actions(root, bill, "upper")
                yield from self.scrape_votes(page, page_path, bill, insert, year)
                bill.add_source(page_path)
                yield bill
Beispiel #28
0
    def scrape_assem_bills(self, chamber, insert, session, year):

        doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution',
                    6: 'joint resolution', 9: 'petition'}
        for docnum, bill_type in doc_type.items():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/' \
                             'Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum)
            links = self.scrape_links(parentpage_url)
            count = 0
            for link in links:
                count = count + 1
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link)
                page = self.get(page_path).text
                page = page.replace(u"\xa0", " ")
                root = lxml.html.fromstring(page)
                root.make_links_absolute("http://www.leg.state.nv.us/")

                bill_id = root.xpath('string(/html/body/div[@id="content"]'
                                     '/table[1]/tr[1]/td[1]/font)')
                title = self.get_node(
                    root,
                    '//div[@id="content"]/table/tr[preceding-sibling::tr/td/'
                    'b[contains(text(), "By:")]]/td/em/text()')

                bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                            title=title, classification=bill_type)

                bill.subject = list(set(self.subject_mapping[bill_id]))
                billtext = root.xpath("//b[text()='Bill Text']")[0].getparent().getnext()
                text_urls = billtext.xpath("./a")
                for text_url in text_urls:
                    version_name = text_url.text.strip()
                    version_url = text_url.attrib['href']
                    bill.add_version_link(note=version_name, url=version_url,
                                          media_type='application/pdf')

                primary, secondary = self.scrape_sponsors(page)

                for leg in primary:
                    bill.add_sponsorship(classification='primary',
                                         name=leg, entity_type='person',
                                         primary=True)
                for leg in secondary:
                    bill.add_sponsorship(classification='cosponsor',
                                         name=leg, entity_type='person',
                                         primary=False)

                minutes_count = 2
                for mr in root.xpath('//table[4]/tr/td[3]/a'):
                    minutes = mr.xpath("string(@href)")
                    minutes_url = "http://www.leg.state.nv.us" + minutes
                    minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count
                    minutes_date = mr.xpath(minutes_date_path).split()
                    minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes"
                    bill.add_document_link(note=minutes_date, url=minutes_url)
                    minutes_count += 1

                self.scrape_actions(root, bill, "lower")
                yield from self.scrape_votes(page, page_path, bill, insert, year)
                bill.add_source(page_path)
                yield bill
Beispiel #29
0
    def scrape_bill(self, chamber, session, bill_id, title, url):
        page = self.lxmlize(url)

        if re.match(r'^(S|H)B ', bill_id):
            btype = ['bill']
        elif re.match(r'(S|H)C ', bill_id):
            btype = ['commemoration']
        elif re.match(r'(S|H)JR ', bill_id):
            btype = ['joint resolution']
        elif re.match(r'(S|H)CR ', bill_id):
            btype = ['concurrent resolution']
        else:
            btype = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=btype
                    )
        bill.add_source(url)

        regex_ns = "http://exslt.org/regular-expressions"
        version_links = page.xpath(
            "//a[re:test(@href, 'Bill.aspx\?File=.*\.htm', 'i')]",
            namespaces={'re': regex_ns})
        for link in version_links:
            bill.add_version_link(
                                link.xpath('string()').strip(),
                                link.attrib['href'],
                                media_type='text/html',
                                on_duplicate='ignore'
                )

        sponsor_links = page.xpath(
            "//td[contains(@id, 'tdSponsors')]/a")
        for link in sponsor_links:
            bill.add_sponsorship(
                    link.text,
                    classification='primary',
                    primary=True,
                    entity_type='person'
                )

        actor = chamber
        use_row = False
        self.debug(bill_id)
        for row in page.xpath("//table[contains(@id, 'BillActions')]/tr"):

            if 'Date' in row.text_content() and 'Action' in row.text_content():
                use_row = True
                continue
            elif not use_row:
                continue

            action = row.xpath("string(td[2])").strip()

            atypes = []
            if action.startswith('First read'):
                atypes.append('introduction')
                atypes.append('reading-1')
            elif action.startswith('Signed by Governor'):
                atypes.append('executive-signature')
                actor = 'executive'

            match = re.match(r'(.*) Do Pass( Amended)?, (Passed|Failed)',
                             action)
            if match:
                if match.group(1) in ['Senate',
                                      'House of Representatives']:
                    first = ''
                else:
                    first = 'committee-'
                if match.group(3).lower() == 'passed':
                    second = 'passage'
                elif match.group(3).lower() == 'failed':
                    second = 'failure'
                atypes.append("%s%s" % (first, second))

            if 'referred to' in action.lower():
                atypes.append('referral-committee')

            if 'Motion to amend, Passed Amendment' in action:
                atypes.append('amendment-introduction')
                atypes.append('amendment-passage')

            if 'Veto override, Passed' in action:
                atypes.append('veto-override-passage')
            elif 'Veto override, Failed' in action:
                atypes.append('veto-override-failure')

            if 'Delivered to the Governor' in action:
                atypes.append('executive-receipt')

            match = re.match("First read in (Senate|House)", action)
            if match:
                if match.group(1) == 'Senate':
                    actor = 'upper'
                else:
                    actor = 'lower'

            date = row.xpath("string(td[1])").strip()
            match = re.match('\d{2}/\d{2}/\d{4}', date)
            if not match:
                self.warning("Bad date: %s" % date)
                continue
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
                yield from self.scrape_vote(bill, date, link.attrib['href'])

            bill.add_action(action, date, chamber=actor, classification=atypes)

        for link in page.xpath("//a[contains(@href, 'Keyword')]"):
            bill.add_subject(link.text.strip())

        yield bill
Beispiel #30
0
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.get(url).text)
        except scrapelib.HTTPError as e:
            self.warning("error (%s) fetching %s, skipping" % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])"
        ).strip()
        if not title:
            self.warning("blank bill on %s - skipping", url)
            return

        if "JR" in bill_id:
            bill_type = ["joint resolution"]
        elif "CR" in bill_id:
            bill_type = ["concurrent resolution"]
        elif "R" in bill_id:
            bill_type = ["resolution"]
        else:
            bill_type = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )
        bill.add_source(url)
        bill.subject = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if ":" in name:
                raise Exception(name)
            if "otherAuth" in link.attrib["id"]:
                bill.add_sponsorship(
                    name,
                    classification="cosponsor",
                    entity_type="person",
                    primary=False,
                )
            else:
                bill.add_sponsorship(
                    name, classification="primary", entity_type="person", primary=True
                )

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == "None":
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == "H":
                actor = "lower"
            elif actor == "S":
                actor = "upper"

            attrs = self.categorizer.categorize(action)
            related_entities = []
            for item in attrs["committees"]:
                related_entities.append({"type": "committee", "name": item})
            for item in attrs["legislators"]:
                related_entities.append({"type": "legislator", "name": item})
            bill.add_action(
                description=action,
                date=date.strftime("%Y-%m-%d"),
                chamber=actor,
                classification=attrs["classification"],
                related_entities=related_entities,
            )

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        # Keep track of already seen versions to prevent processing duplicates.
        version_urls = []
        for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
            version_url = link.attrib["href"]
            if version_url in version_urls:
                self.warning("Skipping duplicate version URL.")
                continue
            else:
                version_urls.append(version_url)
            name = link.text.strip()

            if re.search("COMMITTEE REPORTS|SCHEDULED CCR", version_url, re.IGNORECASE):
                bill.add_document_link(
                    note=name, url=version_url, media_type="application/pdf"
                )
                continue

            bill.add_version_link(
                note=name, url=version_url, media_type="application/pdf"
            )

        self.scrape_amendments(bill, page)

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            if "HT_" not in link.attrib["href"]:
                yield from self.scrape_votes(bill, self.urlescape(link.attrib["href"]))

        # # If the bill has no actions and no versions, it's a bogus bill on
        # # their website, which appears to happen occasionally. Skip.
        has_no_title = bill.title == "Short Title Not Found."
        if has_no_title:
            # If there's no title, this is an empty page. Skip!
            return

        else:
            # Otherwise, save the bills.
            yield bill
Beispiel #31
0
    def scrape_bill(self, chamber, session, bill_id):
        # there will be a space in bill_id if we're doing a one-off bill scrape
        # convert HB 102 into H102
        if ' ' in bill_id:
            bill_id = bill_id[0] + bill_id.split(' ')[-1]

        # if chamber comes in as House/Senate convert to lower/upper
        if chamber == 'Senate':
            chamber = 'upper'
        elif chamber == 'House':
            chamber = 'lower'

        bill_detail_url = ('http://www.ncleg.net/gascripts/'
                           'BillLookUp/BillLookUp.pl?Session=%s&BillID=%s&votesToView=all') % (
                               session, bill_id)

        # parse the bill data page, finding the latest html text
        data = self.get(bill_detail_url).text
        doc = lxml.html.fromstring(data)
        doc.make_links_absolute(bill_detail_url)

        title_div_txt = doc.xpath('//div[contains(@class, "h2")]/text()')[0]
        if 'Joint Resolution' in title_div_txt:
            bill_type = 'joint resolution'
            bill_id = bill_id[0] + 'JR ' + bill_id[1:]
        elif 'Resolution' in title_div_txt:
            bill_type = 'resolution'
            bill_id = bill_id[0] + 'R ' + bill_id[1:]
        elif 'Bill' in title_div_txt:
            bill_type = 'bill'
            bill_id = bill_id[0] + 'B ' + bill_id[1:]

        bill_title = doc.xpath(
            '/html/body/div/div/main/div[2]/div[contains(@class,"col-12")]/a')[0]
        bill_title = bill_title.text_content().strip()

        bill = Bill(bill_id, legislative_session=session, title=bill_title, chamber=chamber,
                    classification=bill_type)
        bill.add_source(bill_detail_url)

        # skip first PDF link (duplicate link to cur version)
        if chamber == 'lower':
            link_xpath = '//a[contains(@href, "/Bills/House/PDF/")]'
        else:
            link_xpath = '//a[contains(@href, "/Bills/Senate/PDF/")]'
        for vlink in doc.xpath(link_xpath)[1:]:
            # get the name from the PDF link...
            version_name = vlink.text.replace(u'\xa0', ' ')
            version_url = vlink.attrib['href']

            media_type = 'text/html'
            if version_url.lower().endswith(".pdf"):
                media_type = 'application/pdf'

            bill.add_version_link(version_name, version_url, media_type=media_type,
                                  on_duplicate='ignore')

        # rows with a 'adopted' in the text and an amendment link, skip failed amds
        for row in doc.xpath('//div[@class="card-body"]/div[contains(., "Adopted")'
                             ' and contains(@class,"row")]//a[@title="Amendment"]'):
            version_url = row.xpath('@href')[0]
            version_name = row.xpath('string(.)').strip()
            bill.add_version_link(version_name, version_url, media_type='application/pdf',
                                  on_duplicate='ignore')

        # sponsors
        spon_row = doc.xpath('//div[contains(text(), "Sponsors")]/following-sibling::div')[0]
        # first sponsors are primary, until we see (Primary)
        spon_type = 'primary'
        spon_lines = spon_row.text_content().replace('\r\n', ';')
        for leg in spon_lines.split(';'):
            name = leg.replace(u'\xa0', ' ').strip()
            if name.startswith('(Primary)') or name.endswith('(Primary)'):
                name = name.replace('(Primary)', '').strip()
                spon_type = 'cosponsor'
            if not name:
                continue
            bill.add_sponsorship(name, classification=spon_type, entity_type='person',
                                 primary=(spon_type == 'primary'))

        # keywords
        kw_row = doc.xpath('//div[contains(text(), "Keywords:")]/following-sibling::div')[0]
        for subject in kw_row.text_content().split(', '):
            bill.add_subject(subject)

        # actions
        action_tr_xpath = (
            '//h6[contains(text(), "History")]'
            '/ancestor::div[contains(@class, "gray-card")]'
            '//div[contains(@class, "card-body")]'
            '/div[@class="row"]'
        )
        # skip two header rows
        for row in doc.xpath(action_tr_xpath):
            cols = row.xpath('div')
            act_date = cols[1].text
            actor = cols[3].text or ''
            # if text is blank, try diving in
            action = (cols[5].text or '').strip() or cols[5].text_content().strip()

            act_date = dt.datetime.strptime(act_date, '%m/%d/%Y').strftime('%Y-%m-%d')

            if actor == 'Senate':
                actor = 'upper'
            elif actor == 'House':
                actor = 'lower'
            else:
                actor = 'executive'

            for pattern, atype in self._action_classifiers.items():
                if action.startswith(pattern):
                    break
            else:
                atype = None

            bill.add_action(action, act_date, chamber=actor, classification=atype)

        # TODO: Fix vote scraper
        # yield from self.scrape_votes(bill, doc)

        yield bill
Beispiel #32
0
    def scrape_bill(self, chamber, session, bill_id, short_title=None):
        """
        Scrapes documents, actions, vote counts and votes for
        bills from the 2009 session and above.
        """
        url = BILL_URL % (session, bill_id.replace(' ', ''))
        bill_page = self.get(url, verify=False).text
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session)
        bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(legislative_session=session, chamber=chamber, identifier=bill_id, title=title,
                    classification=bill_type)
        bill.add_source(url)
        for subject in self._subjects[bill_id.replace(' ', '')]:
            bill.add_subject(subject)

        if short_title and title.lower() != short_title.lower():
            bill.add_title(short_title, 'short title')

        # documents
        doc_links = html.xpath('//div[contains(@class,"pf-content")]//a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get('href')
            if 'Engrossment' in name or 'Bill Text' in name:
                bill.add_version_link(note=name, url=href, media_type="application/pdf")
            else:
                bill.add_document_link(note=name, url=href, media_type="application/pdf")

        def _split(string):
            return re.split(r"\w+[,|AND]\s+", string)

        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split('by')
        if len(sponsor_lists) > 1:
            for sponsors in sponsor_lists[1:]:
                if 'COMMITTEE' in sponsors.upper():
                    bill.add_sponsorship(name=sponsors.strip(), entity_type="organization",
                                         primary=True, classification='primary')
                else:
                    for person in _split(sponsors):
                        person = person.strip()
                        if person != "":
                            bill.add_sponsorship(classification='primary', name=person,
                                                 entity_type="person", primary=True)

        actor = chamber
        last_date = None
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]

            if date:
                last_date = date
            else:
                date = last_date
            date = datetime.datetime.strptime(date + '/' + session[0:4],
                                              "%m/%d/%Y").strftime('%Y-%m-%d')
            if action.startswith('House'):
                actor = 'lower'
            elif action.startswith('Senate'):
                actor = 'upper'

            # votes
            if 'AYES' in action or 'NAYS' in action:
                yield from self.parse_vote(actor, date, row[2], session, bill_id, chamber, url)
                # bill.add_vote_event(vote)
            # some td's text is seperated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace(u'\xa0', ' ').strip()
            atype = get_action(actor, action)
            bill.add_action(action, date, chamber=actor, classification=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if 'to House' in action:
                actor = 'lower'
            elif 'to Senate' in action:
                actor = 'upper'
        yield bill
Beispiel #33
0
    def scrape_bills(self, chamber, session, subjects):
        idex = bill_start_numbers(session)[chamber]
        FROM = "ctl00$rilinContent$txtBillFrom"
        TO = "ctl00$rilinContent$txtBillTo"
        YEAR = "ctl00$rilinContent$cbYear"
        blocks = "FOO"  # Ugh.
        while len(blocks) > 0:
            default_headers = get_default_headers(SEARCH_URL)
            default_headers[FROM] = idex
            default_headers[TO] = idex + MAXQUERY
            default_headers[YEAR] = session
            idex += MAXQUERY
            blocks = self.parse_results_page(self.post(SEARCH_URL,
                                             data=default_headers).text)
            blocks = blocks[1:-1]
            blocks = self.digest_results_page(blocks)

            for block in blocks:
                bill = blocks[block]
                subs = []
                try:
                    subs = subjects[bill['bill_id']]
                except KeyError:
                    pass

                title = bill['title'][len("ENTITLED, "):]
                billid = bill['bill_id']
                try:
                    subs = subjects[bill['bill_id']]
                except KeyError:
                    subs = []

                for b in BILL_NAME_TRANSLATIONS:
                    if billid[:len(b)] == b:
                        billid = BILL_NAME_TRANSLATIONS[b] + billid[len(b) + 1:].split()[0]

                b = Bill(
                    billid,
                    title=title,
                    chamber=chamber,
                    legislative_session=session,
                    classification=self.get_type_by_name(bill['bill_id']),
                )
                b.subject = subs

                # keep bill ID around
                self._bill_id_by_type[(chamber, re.findall(r'\d+', billid)[0])] = billid

                self.process_actions(bill['actions'], b)
                sponsors = bill['sponsors'][len("BY"):].strip()
                sponsors = sponsors.split(",")
                sponsors = [s.strip() for s in sponsors]

                for href in bill['bill_id_hrefs']:
                    b.add_version_link(
                        href.text, href.attrib['href'],
                        media_type="application/pdf")

                for sponsor in sponsors:
                    b.add_sponsorship(
                        sponsor, entity_type='person', classification='primary', primary=True)

                b.add_source(SEARCH_URL)
                yield b
Beispiel #34
0
    def scrape_bills(self, session, year_abr):
        # Main Bill information
        main_bill_csv = self.access_to_csv('MainBill')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["Synopsis"]
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(
                bill_id,
                title=title,
                chamber=chamber,
                legislative_session=session,
                classification=self._bill_types[bill_type[1:]],
            )
            if rec['IdenticalBillNumber'].strip():
                bill.add_related_bill(
                    rec['IdenticalBillNumber'].split()[0],
                    legislative_session=session,
                    relation_type='companion',
                )

            # TODO: last session info is in there too
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_csv = self.access_to_csv('BillSpon')

        for rec in bill_sponsors_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in sponsor database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            name = rec["Sponsor"]
            sponsor_type = rec["Type"]
            if sponsor_type == 'P':
                sponsor_type = "primary"
            else:
                sponsor_type = "cosponsor"
            bill.add_sponsorship(name, classification=sponsor_type, entity_type='person',
                                 primary=sponsor_type == 'primary')

        # Documents
        bill_document_csv = self.access_to_csv('BillWP')

        for rec in bill_document_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in document database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            document = rec["Document"]
            document = document.split('\\')
            document = document[-2] + "/" + document[-1]

            # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
            htm_url = 'http://www.njleg.state.nj.us/{}/Bills/{}'.format(
                year_abr,
                document.replace('.DOC', '.HTM'),
            )

            # name document based _doctype
            try:
                doc_name = self._doctypes[rec['DocType']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['DocType'], bill_id))
            if rec['Comment']:
                doc_name += ' ' + rec['Comment']

            # Clean HTMX links.
            if htm_url.endswith('HTMX'):
                htm_url = re.sub('X$', '', htm_url)

            if rec['DocType'] in self._version_types:
                if htm_url.endswith('HTM'):
                    mimetype = 'text/html'
                elif htm_url.endswith('wpd'):
                    mimetype = 'application/vnd.wordperfect'
                try:
                    bill.add_version_link(doc_name, htm_url, media_type=mimetype)
                except ValueError:
                    self.warning("Couldn't find a document for bill {}".format(bill_id))
                    pass
            else:
                bill.add_document_link(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = [
            'A%s' % year_abr,
            'A%s' % next_year,
            'S%s' % year_abr,
            'S%s' % next_year,
            'CA%s-%s' % (year_abr, next_year),
            'CS%s-%s' % (year_abr, next_year),
        ]

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            try:
                s_vote_zip, resp = self.urlretrieve(s_vote_url)
            except scrapelib.FTPError:
                self.warning('could not find %s' % s_vote_url)
                continue
            zippedfile = zipfile.ZipFile(s_vote_zip)
            for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
                try:
                    vote_file = io.TextIOWrapper(zippedfile.open(vfile, 'rU'))
                except KeyError:
                    #
                    # Right, so, 2011 we have an "End" file with more
                    # vote data than was in the original dump.
                    #
                    self.warning("No such file: %s" % (vfile))
                    continue

                vdict_file = csv.DictReader(vote_file)

                votes = {}
                if filename.startswith('A') or filename.startswith('CA'):
                    chamber = "lower"
                else:
                    chamber = "upper"

                if filename.startswith('C'):
                    vote_file_type = 'committee'
                else:
                    vote_file_type = 'chamber'

                for rec in vdict_file:
                    if vote_file_type == 'chamber':
                        bill_id = rec["Bill"].strip()
                        leg = rec["Full_Name"]

                        date = rec["Session_Date"]
                        action = rec["Action"]
                        leg_vote = rec["Legislator_Vote"]
                        vote_parts = (bill_id, chamber, action)
                    else:
                        bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                        leg = rec['Name']
                        # drop time portion
                        date = rec['Agenda_Date'].split()[0]
                        # make motion readable
                        action = self._com_vote_motions[rec['BillAction']]
                        # first char (Y/N) use [0:1] to ignore ''
                        leg_vote = rec['LegislatorVote'][0:1]
                        committee = rec['Committee_House']
                        vote_parts = (bill_id, chamber, action, committee)

                    date = datetime.strptime(date, "%m/%d/%Y")
                    vote_id = '_'.join(vote_parts).replace(' ', '_')

                    if bill_id[0] == 'A':
                        b_chamber = "lower"
                    else:
                        b_chamber = "upper"

                    if vote_id not in votes:
                        votes[vote_id] = VoteEvent(
                            start_date=TIMEZONE.localize(date),
                            chamber=chamber,
                            motion_text=action,
                            classification='passage',
                            result=None,
                            bill=bill_id,
                            bill_chamber=b_chamber,
                            legislative_session=session,
                        )
                    if leg_vote == "Y":
                        votes[vote_id].vote('yes', leg)
                    elif leg_vote == "N":
                        votes[vote_id].vote('no', leg)
                    else:
                        votes[vote_id].vote('other', leg)

            # remove temp file
            os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.values():
                counts = collections.defaultdict(int)
                for count in vote.votes:
                    counts[count['option']] += 1
                vote.set_count('yes', counts['yes'])
                vote.set_count('no', counts['no'])
                vote.set_count('other', counts['other'])

                # Veto override.
                if vote.motion_text == 'OVERRIDE':
                    # Per the NJ leg's glossary, a veto override requires
                    # 2/3ds of each chamber. 27 in the senate, 54 in the house.
                    # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                    if vote.chamber == 'lower':
                        vote.result = 'pass' if counts['yes'] >= 54 else 'fail'
                    elif vote['chamber'] == 'upper':
                        vote.result = 'pass' if counts['yes'] >= 27 else 'fail'
                else:
                    # Regular vote.
                    vote.result = 'pass' if counts['yes'] > counts['no'] else 'fail'

                vote.add_source('http://www.njleg.state.nj.us/downloads.asp')
                yield vote

        # Actions
        bill_action_csv = self.access_to_csv('BillHist')
        actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

        for rec in bill_action_csv:
            bill_type = rec["BillType"].strip()
            bill_number = int(rec["BillNumber"])
            bill_id = bill_type + str(bill_number)
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in action database' % bill_id)
                continue
            bill = bill_dict[bill_id]
            action = rec["Action"]
            date = rec["DateAction"]
            date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
            actor = actor_map[rec["House"]]
            comment = rec["Comment"]
            action, atype = self.categorize_action(action, bill_id)
            if comment:
                action += (' ' + comment)
            bill.add_action(
                action,
                date=TIMEZONE.localize(date),
                classification=atype,
                chamber=actor,
            )

        # Subjects
        subject_csv = self.access_to_csv('BillSubj')
        for rec in subject_csv:
            bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
            if bill_id not in bill_dict:
                self.warning('unknown bill %s in subject database' % bill_id)
                continue
            bill = bill_dict.get(bill_id)
            if bill:
                bill.subject.append(rec['SubjectKey'])
            else:
                self.warning('invalid bill id in BillSubj: %s' % bill_id)

        phony_bill_count = 0
        # save all bills at the end
        for bill in bill_dict.values():
            # add sources
            if not bill.actions and not bill.versions:
                self.warning('probable phony bill detected %s',
                             bill.identifier)
                phony_bill_count += 1
            else:
                bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
                yield bill

        if phony_bill_count:
            self.warning('%s total phony bills detected', phony_bill_count)
    def scrape_bill(self, bill_id):
        old = self.api('bills/' + bill_id + '?')

        # not needed
        old.pop('id')
        old.pop('state')
        old.pop('level', None)
        old.pop('country', None)
        old.pop('created_at')
        old.pop('updated_at')
        old.pop('action_dates')
        old.pop('+bill_type',None)
        old.pop('+subject', None)
        old.pop('+scraped_subjects', None)
        old.pop('subjects', [])

        classification = old.pop('type')

        # ca weirdness
        if 'fiscal committee' in classification:
            classification.remove('fiscal committee')
        if 'urgency' in classification:
            classification.remove('urgency')
        if 'local program' in classification:
            classification.remove('local program')
        if 'tax levy' in classification:
            classification.remove('tax levy')

        if classification[0] in ['miscellaneous', 'jres', 'cres']:
            return

        if classification == ['memorial resolution'] and self.state == 'ar':
            classification = ['memorial']
        if classification == ['concurrent memorial resolution'] and self.state == 'ar':
            classification = ['concurrent memorial']
        if classification == ['joint session resolution'] and self.state == 'il':
            classification = ['joint resolution']
        if classification == ['legislative resolution'] and self.state == 'ny':
            classification = ['resolution']
        if classification == ['address'] and self.state == 'nh':
            classification = ['resolution']

        if not old['title'] and self.state == 'me':
            old['title'] = '(unknown)'

        chamber = old.pop('chamber')
        if self.state in ('ne', 'dc'):
            chamber = 'legislature'
        elif chamber in ('joint', 'conference'):
            chamber = 'legislature'

        new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
                   chamber=chamber, classification=classification)

        abstract = old.pop('summary', None)
        if abstract:
            new.add_abstract(abstract, note='')

        for title in old.pop('alternate_titles'):
            new.add_title(title)

        for doc in old.pop('documents'):
            new.add_document_link(doc['name'], doc['url'], on_duplicate='ignore')

        for doc in old.pop('versions'):
            new.add_version_link(doc['name'], doc['url'], media_type=doc.pop('mimetype', ''))

        for subj in old.pop('scraped_subjects', []):
            if subj:
                new.add_subject(subj)

        for spon in old.pop('sponsors'):
            if spon.get('committee_id') is not None:
                entity_type = 'organization'
            elif spon.get('leg_id') is not None:
                entity_type = 'person'
            else:
                entity_type = ''
            new.add_sponsorship(spon['name'], spon['type'], entity_type,
                                spon['type'] == 'primary')

        for act in old.pop('actions'):
            actor = act['actor']
            if actor.lower() in ('governor', 'mayor', 'secretary of state'):
                actor = 'executive'
            elif actor.lower() == 'house' or (actor.lower().startswith('lower (') and self.state == 'ca'):
                actor = 'lower'
            elif actor.lower() in ('senate', 'upper`') or (actor.lower().startswith('upper (') and self.state == 'ca'):
                actor = 'upper'
            elif actor in ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
                           'Office of the Legislative Fiscal Analyst', 'Became Law w',
                           'conference') or (actor.lower().startswith('legislature (') and self.state == 'ca'):
                actor = 'legislature'

            if actor in ('committee', 'sponsor') and self.state == 'pr':
                actor = 'legislature'

            # nebraska & DC
            if actor in ('upper','council') and self.state in ('ne', 'dc'):
                actor = 'legislature'

            if act['action']:
                newact = new.add_action(act['action'], act['date'][:10], chamber=actor,
                                        classification=[action_types[c] for c in act['type'] if c != 'other'])
                for re in act.get('related_entities', []):
                    if re['type'] == 'committee':
                        re['type'] = 'organization'
                    elif re['type'] == 'legislator':
                        re['type'] = 'person'
                    newact.add_related_entity(re['name'], re['type'])

        for comp in old.pop('companions', []):
            if self.state in ('nj', 'ny', 'mn'):
                rtype = 'companion'
            new.add_related_bill(comp['bill_id'], comp['session'], rtype)

        for abid in old.pop('alternate_bill_ids', []) + old.pop('+alternate_bill_ids', []):
            new.add_identifier(abid)


        # generic OpenStates stuff
        for id in old.pop('all_ids'):
            new.add_identifier(id, scheme='openstates')

        for source in old.pop('sources'):
            source.pop('retrieved', None)
            new.add_source(**source)

        ext_title = old.pop('+extended_title', None)
        if ext_title:
            new.add_title(ext_title, note='Extended Title')
        official_title = old.pop('+official_title', None)
        if official_title:
            new.add_title(official_title, note='Official Title')

        to_extras = ['+status', '+final_disposition', '+volume_chapter', '+ld_number', '+referral',
                     '+companion', '+description', '+fiscal_note_probable:',
                     '+preintroduction_required:', '+drafter', '+category:', '+chapter',
                     '+requester', '+transmittal_date:', '+by_request_of', '+bill_draft_number:',
                     '+bill_lr', '+bill_url', '+rcs_num', '+fiscal_note', '+impact_clause', '+fiscal_notes',
                     '+short_title', '+type_', '+conference_committee', 'conference_committee',
                     '+companion_bill_ids', '+additional_information']
        for k in to_extras:
            v = old.pop(k, None)
            if v:
                new.extras[k.replace('+', '')] = v

        # votes
        vote_no = 1
        for vote in old.pop('votes'):
            vote.pop('id')
            vote.pop('state')
            vote.pop('bill_id')
            vote.pop('bill_chamber', None)
            vote.pop('+state', None)
            vote.pop('+country', None)
            vote.pop('+level', None)
            vote.pop('+vacant', None)
            vote.pop('+not_voting', None)
            vote.pop('+amended', None)
            vote.pop('+excused', None)
            vote.pop('+NV', None)
            vote.pop('+AB', None)
            vote.pop('+P', None)
            vote.pop('+V', None)
            vote.pop('+E', None)
            vote.pop('+EXC', None)
            vote.pop('+EMER', None)
            vote.pop('+present', None)
            vote.pop('+absent', None)
            vote.pop('+seconded', None)
            vote.pop('+moved', None)
            vote.pop('+vote_type', None)
            vote.pop('+actual_vote', None)
            vote.pop('+skip_votes', None)
            vote.pop('vote_id')
            vote.pop('+bill_chamber', None)
            vote.pop('+session', None)
            vote.pop('+bill_id', None)
            vote.pop('+bill_session', None)
            vote.pop('committee', None)
            vote.pop('committee_id', None)
            vtype = vote.pop('type', 'passage')

            if vtype == 'veto_override':
                vtype = ['veto-override']
            elif vtype == 'amendment':
                vtype = ['amendment-passage']
            elif vtype == 'other':
                vtype = ''
            else:
                vtype = ['bill-passage']

            # most states need identifiers for uniqueness, just do it everywhere
            identifier = vote['date'] + '-' + str(vote_no)
            vote_no += 1

            chamber = vote.pop('chamber')
            if chamber == 'upper' and self.state in ('ne', 'dc'):
                chamber = 'legislature'
            elif chamber == 'joint':
                chamber = 'legislature'

            newvote = VoteEvent(legislative_session=vote.pop('session'),
                           motion_text=vote.pop('motion'),
                           result='pass' if vote.pop('passed') else 'fail',
                           chamber=chamber,
                           start_date=vote.pop('date'),
                           classification=vtype,
                           bill=new,
                           identifier=identifier)
            for vt in ('yes', 'no', 'other'):
                newvote.set_count(vt, vote.pop(vt + '_count'))
                for name in vote.pop(vt + '_votes'):
                    newvote.vote(vt, name['name'])

            for source in vote.pop('sources'):
                source.pop('retrieved', None)
                newvote.add_source(**source)

            if not newvote.sources:
                newvote.sources = new.sources

            to_extras = ['+record', '+method', 'method', '+filename', 'record', '+action',
                         '+location', '+rcs_num', '+type_', '+threshold', '+other_vote_detail',
                         '+voice_vote']
            for k in to_extras:
                v = vote.pop(k, None)
                if v:
                    newvote.extras[k.replace('+', '')] = v

            assert not vote, vote.keys()
            yield newvote

        assert not old, old.keys()

        yield new
Beispiel #36
0
    def scrape(self, session=None):
        HTML_TAGS_RE = r'<.*?>'

        if session is None:
            session = self.latest_session()

        year_slug = self.jurisdiction.get_year_slug(session)

        # Load all bills and resolutions via the private API
        bills_url = \
            'http://legislature.vermont.gov/bill/loadBillsReleased/{}/'.\
            format(year_slug)
        bills_json = self.get(bills_url).text
        bills = json.loads(bills_json)['data'] or []

        bills_url = \
            'http://legislature.vermont.gov/bill/loadBillsIntroduced/{}/'.\
            format(year_slug)
        bills_json = self.get(bills_url).text
        bills.extend(json.loads(bills_json)['data'] or [])

        resolutions_url = \
            'http://legislature.vermont.gov/bill/loadAllResolutionsByChamber/{}/both'.\
            format(year_slug)
        resolutions_json = self.get(resolutions_url).text
        bills.extend(json.loads(resolutions_json)['data'] or [])

        # Parse the information from each bill
        for info in bills:
            # Strip whitespace from strings
            info = {k: v.strip() for k, v in info.items()}

            # Identify the bill type and chamber
            if info['BillNumber'].startswith('J.R.H.'):
                bill_type = 'joint resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('J.R.S.'):
                bill_type = 'joint resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.C.R.'):
                bill_type = 'concurrent resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('H.R.'):
                bill_type = 'resolution'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.R.'):
                bill_type = 'resolution'
                bill_chamber = 'upper'

            elif info['BillNumber'].startswith('PR.'):
                bill_type = 'constitutional amendment'
                if info['Body'] == 'H':
                    bill_chamber = 'lower'
                elif info['Body'] == 'S':
                    bill_chamber = 'upper'
                else:
                    raise AssertionError("Amendment not tied to chamber")

            elif info['BillNumber'].startswith('H.'):
                bill_type = 'bill'
                bill_chamber = 'lower'
            elif info['BillNumber'].startswith('S.'):
                bill_type = 'bill'
                bill_chamber = 'upper'

            else:
                raise AssertionError(
                    "Unknown bill type found: '{}'".
                    format(info['BillNumber'])
                )

            bill_id = info['BillNumber'].replace('.', '').replace(' ', '')
            # put one space back in between type and number
            bill_id = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', bill_id)

            # Create the bill using its basic information
            bill = Bill(
                identifier=bill_id,
                legislative_session=session,
                chamber=bill_chamber,
                title=info['Title'],
                classification=bill_type
            )
            if 'resolution' in bill_type:
                bill.add_source(resolutions_url)
            else:
                bill.add_source(bills_url)

            # Load the bill's information page to access its metadata
            bill_url = 'http://legislature.vermont.gov/bill/status/{0}/{1}'.\
                format(year_slug, info['BillNumber'])
            doc = self.lxmlize(bill_url)
            bill.add_source(bill_url)

            # Capture sponsors
            sponsors = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Sponsor(s)"]/'
                'following-sibling::dd[1]/ul/li'
            )
            sponsor_type = 'primary'
            for sponsor in sponsors:
                if sponsor.xpath('span/text()') == ['Additional Sponsors']:
                    sponsor_type = 'cosponsor'
                    continue

                sponsor_name = sponsor.xpath('a/text()')[0].\
                    replace("Rep.", "").replace("Sen.", "").strip()
                if sponsor_name and not \
                        (sponsor_name[:5] == "Less" and len(sponsor_name) == 5):
                    bill.add_sponsorship(
                        name=sponsor_name,
                        classification=sponsor_type,
                        entity_type='person',
                        primary=(sponsor_type == 'primary')
                    )

            # Capture bill text versions
            # Warning: There's a TODO in VT's source code saying 'move this to where it used to be'
            # so leave in the old and new positions
            versions = doc.xpath(
                '//dl[@class="summary-table"]/dt[text()="Bill/Resolution Text"]/'
                'following-sibling::dd[1]/ul/li/a |'
                '//ul[@class="bill-path"]//a'
            )

            for version in versions:
                if version.xpath('text()'):
                    bill.add_version_link(
                        note=version.xpath('text()')[0],
                        url=version.xpath('@href')[0].replace(' ', '%20'),
                        media_type='application/pdf'
                    )

            # Identify the internal bill ID, used for actions and votes
            # If there is no internal bill ID, then it has no extra information
            try:
                internal_bill_id = re.search(
                    r'"bill/loadBillDetailedStatus/.+?/(\d+)"',
                    lxml.etree.tostring(doc).decode('utf-8')
                ).group(1)
            except AttributeError:
                self.warning("Bill {} appears to have no activity".format(info['BillNumber']))
                yield bill
                continue

            # Capture actions
            actions_url = 'http://legislature.vermont.gov/bill/loadBillDetailedStatus/{0}/{1}'.\
                format(year_slug, internal_bill_id)
            actions_json = self.get(actions_url).text
            actions = json.loads(actions_json)['data']
            bill.add_source(actions_url)

            chambers_passed = set()
            for action in actions:
                action = {k: v for k, v in action.items() if v is not None}

                if "Signed by Governor" in action['FullStatus']:
                    actor = 'executive'
                elif action['ChamberCode'] == 'H':
                    actor = 'lower'
                elif action['ChamberCode'] == 'S':
                    actor = 'upper'
                else:
                    raise AssertionError("Unknown actor for bill action")

                # Categorize action
                if "Signed by Governor" in action['FullStatus']:
                    # assert chambers_passed == set("HS")
                    action_type = 'executive-signature'
                elif "Vetoed by the Governor" in action['FullStatus']:
                    action_type = 'executive-veto'
                elif "Read first time" in action['FullStatus'] \
                        or "Read 1st time" in action['FullStatus']:
                    action_type = 'introduction'
                elif "Reported favorably" in action['FullStatus']:
                    action_type = 'committee-passage-favorable'
                elif actor == 'lower' and any(x.lower().startswith('aspassed')
                                              for x in action['keywords'].split(';')):
                    action_type = 'passage'
                    chambers_passed.add("H")
                elif actor == 'upper' and any(x.lower().startswith(' aspassed')
                                              or x.lower().startswith('aspassed')
                                              for x in action['keywords'].split(';')):
                    action_type = 'passage'
                    chambers_passed.add("S")
                else:
                    action_type = None

                bill.add_action(
                    description=re.sub(HTML_TAGS_RE, "", action['FullStatus']),
                    date=datetime.datetime.strftime(
                        datetime.datetime.strptime(action['StatusDate'], '%m/%d/%Y'),
                        '%Y-%m-%d'
                    ),
                    chamber=actor,
                    classification=action_type
                )

            # Capture votes
            votes_url = 'http://legislature.vermont.gov/bill/loadBillRollCalls/{0}/{1}'.format(
                year_slug, internal_bill_id)
            votes_json = self.get(votes_url).text
            votes = json.loads(votes_json)['data']
            bill.add_source(votes_url)

            for vote in votes:
                roll_call_id = vote['VoteHeaderID']
                roll_call_url = ('http://legislature.vermont.gov/bill/'
                                 'loadBillRollCallDetails/{0}/{1}'.format(
                                     year_slug, roll_call_id))
                roll_call_json = self.get(roll_call_url).text
                roll_call = json.loads(roll_call_json)['data']

                roll_call_yea = []
                roll_call_nay = []
                roll_call_not_voting = []
                for member in roll_call:
                    (member_name, _district) = member['MemberName'].split(" of ")
                    member_name = member_name.strip()

                    if member['MemberVote'] == "Yea":
                        roll_call_yea.append(member_name)
                    elif member['MemberVote'] == "Nay":
                        roll_call_nay.append(member_name)
                    else:
                        roll_call_not_voting.append(member_name)

                if ("Passed -- " in vote['FullStatus'] or
                        "Veto of Governor overridden" in vote['FullStatus']):
                    did_pass = True
                elif ("Failed -- " in vote['FullStatus'] or
                      'Veto of the Governor sustained' in vote['FullStatus']):
                    did_pass = False
                else:
                    raise AssertionError("Roll call vote result is unclear")

                # Check vote counts
                yea_count = int(re.search(r'Yeas = (\d+)', vote['FullStatus']).group(1))
                nay_count = int(re.search(r'Nays = (\d+)', vote['FullStatus']).group(1))

                vote_to_add = VoteEvent(
                    bill=bill,
                    chamber=('lower' if vote['ChamberCode'] == 'H' else 'upper'),
                    start_date=datetime.datetime.strftime(
                        datetime.datetime.strptime(vote['StatusDate'], '%m/%d/%Y'),
                        '%Y-%m-%d'
                    ),
                    motion_text=re.sub(HTML_TAGS_RE, "", vote['FullStatus']).strip(),
                    result='pass' if did_pass else 'fail',
                    classification='passage',
                    legislative_session=session,
                )
                vote_to_add.add_source(roll_call_url)

                vote_to_add.set_count('yes', yea_count)
                vote_to_add.set_count('no', nay_count)
                vote_to_add.set_count('not voting', len(roll_call_not_voting))

                for member in roll_call_yea:
                    vote_to_add.yes(member)
                for member in roll_call_nay:
                    vote_to_add.no(member)
                for member in roll_call_not_voting:
                    vote_to_add.vote('not voting', member)

                yield vote_to_add

            # Capture extra information-  Not yet implemented
            # Witnesses:
            #   http://legislature.vermont.gov/bill/loadBillWitnessList/{year_slug}/{internal_bill_id}
            # Conference committee members:
            #   http://legislature.vermont.gov/bill/loadBillConference/{year_slug}/{bill_number}
            # Committee meetings:
            #   http://legislature.vermont.gov/committee/loadHistoryByBill/{year_slug}?LegislationId={internal_bill_id}

            yield bill
Beispiel #37
0
    def scrape_bill_type(self, chamber, session, bill_type, type_abbr,
                         committee_abbr_regex=get_committee_name_regex()):
        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(
            measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_id, session, title='', chamber=chamber)
            if ((bill_id.startswith('S') and chamber == 'lower') or
                    (bill_id.startswith('A') and chamber == 'upper')):
                print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
                continue

            # # Construct session for web query, going from '20092010' to '0910'
            # source_session = session[2:4] + session[6:8]

            # # Turn 'AB 10' into 'ab_10'
            # source_num = "%s_%s" % (bill.measure_type.lower(),
            #                         bill.measure_num)

            # Construct a fake source url
            source_url = ('http://leginfo.legislature.ca.gov/faces/'
                          'billNavClient.xhtml?bill_id=%s') % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id, source_url, media_type='text/html')

            title = ''
            type_ = ['bill']
            subject = ''
            all_titles = set()

            # Get digest test (aka "summary") from latest version.
            if bill.versions:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = '//caml:DigestText/xhtml:p'
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r'\s+', ' ', t)
                    t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                    chunks.append(t)
                summary = '\n\n'.join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime(
                    '%m/%d/%y')
                version_name = "{} - {}".format(
                    version_date_human, version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(
                    version_name,
                    version_url_pdf,
                    media_type='application/pdf',
                    date=version_date.date())

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ('AB', 'SB'):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(version.short_title) and \
                            not version.title.lower().startswith('an act'):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == 'Yes':
                    type_.append('appropriation')

                tags = []
                if version.fiscal_committee == 'Yes':
                    tags.append('fiscal committee')
                if version.local_program == 'Yes':
                    tags.append('local program')
                if version.urgency == 'Yes':
                    tags.append('urgency')
                if version.taxlevy == 'Yes':
                    tags.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note='summary')
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras['impact_clause'] = impact_clause
            fsbill.extras['tags'] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == 'Y',
                    entity_type='person',
                )
                # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {'Assembly': 'lower',
                             'Senate': 'upper'}[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:
                    def replacer(matchobj):
                        if matchobj:
                            return {'Assembly': 'lower',
                                    'Senate': 'upper'}[matchobj.group()]
                        else:
                            return matchobj.group()

                    actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r'\s+', ' ', act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r'Com[s]?. on', action.action) and not matched_abbrs:
                    msg = 'Failed to extract committee abbr from %r.'
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ('Mapping contains no committee name for '
                                   'abbreviation %r. Action text was %r.')
                            args = (abbr, action.action)
                            raise KeyError(msg % args)

                    committees = filter(None, committees)
                    kwargs['committees'] = committees

                    code = re.search(r'C[SXZ]\d+', actor)
                    if code is not None:
                        code = code.group()
                        kwargs['actor_info'] = {'committee_code': code}

                    assert len(list(committees)) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace('Coms. on ', '')
                        act_str = act_str.replace('Com. on ' + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith('.'):
                            act_str = act_str + '.'

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ['upper', 'lower', 'legislature']:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = 'legislature'

                if actor != action.actor:
                    actor_info = kwargs.get('actor_info', {})
                    actor_info['details'] = action.actor
                    kwargs['actor_info'] = actor_info

                # Add strings for related legislators, if any.
                rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs['legislators'] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(act_str, date.strftime('%Y-%m-%d'), chamber=actor,
                                           classification=kwargs['classification'])
                for committee in kwargs.get('committees', []):
                    action.add_related_entity(
                        committee, entity_type='organization')
                seen_actions.add((actor, act_str, date))

            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == '(PASS)':
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue

                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                if vote.motion:
                    motion = vote.motion.motion_text or ''
                else:
                    motion = ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ',
                                '', motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                                '', motion)
                motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                                r'Urgency Clause$',
                                '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                # 'name': vote_location,
                # 'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result='pass' if result else 'fail',
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {'threshold': vote.threshold}

                source_url = (
                    'http://leginfo.legislature.ca.gov/faces'
                    '/billVotesClient.xhtml?bill_id={}'
                ).format(fsbill.identifier)
                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + '#' + str(vote_num)

                rc = {'yes': [], 'no': [], 'other': []}
                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        rc['yes'].append(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        rc['no'].append(record.legislator_name)
                    else:
                        rc['other'].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

            yield fsbill
            self.session.expire_all()
    def scrape_bill(self, chamber, session, bill_id, bill_type, url):
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        title = doc.xpath('//b[text()="TITLE:"]')
        if title:
            title = title[0].tail.strip().strip('"')
        else:
            self.warning("skipping bill %s, no information" % url)
            return

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            classification=bill_type,
            legislative_session=session,
        )
        bill.add_source(url)

        # Get sponsors
        spons_str = doc.xpath(
            '//b[contains(text(), "SPONSOR")]')[0].tail.strip()
        sponsors_match = re.match(
            '(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})', spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(',')
            sponsor = sponsors[0].strip()

            if sponsor:
                bill.add_sponsorship(
                    sponsors[0],
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

            for sponsor in sponsors[1:]:
                sponsor = sponsor.strip()
                if sponsor:
                    bill.add_sponsorship(
                        sponsor,
                        entity_type='person',
                        classification='cosponsor',
                        primary=False,
                    )
        else:
            # Committee sponsorship
            spons_str = spons_str.strip()

            if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str):
                spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$', '',
                                   spons_str).title()
                spons_str = (spons_str +
                             " Committee (by request of the governor)")

            if spons_str:
                bill.add_sponsorship(
                    spons_str,
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

        # Get actions from second myth table
        self._current_comm = None
        act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:]
        for row in act_rows:
            date, journal, raw_chamber, action = row.xpath('td')

            act_date = datetime.datetime.strptime(date.text_content().strip(),
                                                  '%m/%d/%y')
            raw_chamber = raw_chamber.text_content().strip()
            action = action.text_content().strip()

            if raw_chamber == "(H)":
                act_chamber = "lower"
            elif raw_chamber == "(S)":
                act_chamber = "upper"

            if re.match("\w+ Y(\d+)", action):
                vote_href = journal.xpath('.//a/@href')
                if vote_href:
                    yield from self.parse_vote(bill, action, act_chamber,
                                               act_date, vote_href[0])

            action, atype = self.clean_action(action)

            match = re.match('^Prefile released (\d+/\d+/\d+)$', action)
            if match:
                action = 'Prefile released'
                act_date = datetime.datetime.strptime(match.group(1),
                                                      '%m/%d/%y')

            bill.add_action(action,
                            chamber=act_chamber,
                            date=act_date.strftime('%Y-%m-%d'),
                            classification=atype)

        # Get subjects
        for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
            bill.add_subject(subj.strip())

        # Get versions
        text_list_url = ("http://www.legis.state.ak.us/"
                         "basis/get_fulltext.asp?session=%s&bill=%s") % (
                             session, bill_id)
        bill.add_source(text_list_url)

        text_doc = lxml.html.fromstring(self.get(text_list_url).text)
        text_doc.make_links_absolute(text_list_url)
        for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'):
            name = link.xpath('../preceding-sibling::td/text()')[0].strip()
            text_url = link.get('href')
            bill.add_version_link(name, text_url, media_type="text/html")

        # Get documents
        doc_list_url = ("http://www.legis.state.ak.us/"
                        "basis/get_documents.asp?session=%s&bill=%s") % (
                            session, bill_id)
        doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
        doc_list.make_links_absolute(doc_list_url)
        bill.add_source(doc_list_url)
        for href in doc_list.xpath(
                '//a[contains(@href, "get_documents")][@onclick]'):
            h_name = href.text_content()
            h_href = href.attrib['href']
            if h_name.strip():
                bill.add_document_link(h_name, h_href)

        yield bill
Beispiel #39
0
    def scrape_bill(self, session, chamber, bill_id, title, url,
                    strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub):

        html = self.get(url).text

        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        bill_type = self.bill_types[bill_id.split()[0][1:]]

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.add_source(url)

        xpath = ('//strong[contains(., "SUBJECT")]/../'
                 'following-sibling::td/a/text()')
        bill.subject = page.xpath(xpath)

        for version in self.scrape_versions(session, chamber, page, bill_id):
            bill.add_version_link(**version)

        # Resolution pages have different html.
        values = {}
        trs = page.xpath('//div[@id="bhistcontent"]/table/tr')
        for tr in trs:
            heading = tr.xpath('td/strong/text()')
            if heading:
                heading = heading[0]
            else:
                continue
            value = tr.text_content().replace(heading, '').strip()
            values[heading] = value

        # summary was always same as title
        # bill['summary'] = values['SUMMARY:']

        # Add primary sponsor.
        primary = strip_sponsors('', values.get('LEAD SPONSOR:', ''))
        if primary:
            bill.add_sponsorship(
                name=primary,
                classification='primary',
                entity_type='person',
                primary=True
            )

        # Add cosponsors.
        if values.get('SPONSORS:'):
            sponsors = strip_sponsors('', values['SPONSORS:'])
            sponsors = re.split(', (?![A-Z]\.)', sponsors)
            for name in sponsors:
                name = name.strip(', \n\r')
                if name:
                    # Fix name splitting bug where "Neale, D. Hall"
                    match = re.search('(.+?), ([DM]\. Hall)', name)
                    if match:
                        for name in match.groups():
                            bill.add_sponsorship(
                                name=name,
                                classification='cosponsor',
                                entity_type='person',
                                primary=False
                            )
                    else:
                        bill.add_sponsorship(
                            name=name,
                            classification='cosponsor',
                            entity_type='person',
                            primary=False
                        )

        for link in page.xpath("//a[contains(@href, 'votes/house')]"):
            yield from self.scrape_house_vote(bill, link.attrib['href'])

        for tr in reversed(page.xpath("//table[@class='tabborder']/descendant::tr")[1:]):
            tds = tr.xpath('td')
            if len(tds) < 3:
                continue

            chamber_letter = tds[0].text_content()
            chamber = {'S': 'upper', 'H': 'lower'}[chamber_letter]

            # Index of date info no longer varies on resolutions.
            date = tds[2].text_content().strip()
            date = datetime.datetime.strptime(date, "%m/%d/%y").date()

            action = tds[1].text_content().strip()
            if action.lower().startswith('passed senate'):
                for href in tds[1].xpath('a/@href'):
                    yield from self.scrape_senate_vote(bill, href, date)

            attrs = dict(chamber=chamber, description=action, date=date.strftime("%Y-%m-%d"))
            temp = self.categorizer.categorize(action)
            related_entities = []
            for key, values in temp.items():
                if key != 'classification':
                    for value in values:
                        related_entities.append({
                            "type": key,
                            "name": value
                        })
            attrs.update(classification=temp['classification'], related_entities=related_entities)
            bill.add_action(**attrs)

        yield bill
Beispiel #40
0
    def scrape_bill(self,
                    session,
                    chamber,
                    bill_id,
                    title,
                    url,
                    strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub):

        html = self.get(url).text

        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        bill_type = self.bill_types[bill_id.split()[0][1:]]

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.add_source(url)

        xpath = ('//strong[contains(., "SUBJECT")]/../'
                 'following-sibling::td/a/text()')
        bill.subject = page.xpath(xpath)

        for version in self.scrape_versions(session, chamber, page, bill_id):
            bill.add_version_link(**version)

        self.scrape_amendments(page, bill)

        # Resolution pages have different html.
        values = {}
        trs = page.xpath('//div[@id="bhistcontent"]/table/tr')
        for tr in trs:
            heading = tr.xpath('td/strong/text()')
            if heading:
                heading = heading[0]
            else:
                continue
            value = tr.text_content().replace(heading, '').strip()
            values[heading] = value

        # summary was always same as title
        # bill['summary'] = values['SUMMARY:']

        # Add primary sponsor.
        primary = strip_sponsors('', values.get('LEAD SPONSOR:', ''))
        if primary:
            bill.add_sponsorship(name=primary,
                                 classification='primary',
                                 entity_type='person',
                                 primary=True)

        # Add cosponsors.
        if values.get('SPONSORS:'):
            sponsors = strip_sponsors('', values['SPONSORS:'])
            sponsors = re.split(r', (?![A-Z]\.)', sponsors)
            for name in sponsors:
                name = name.strip(', \n\r')
                if name:
                    # Fix name splitting bug where "Neale, D. Hall"
                    match = re.search(r'(.+?), ([DM]\. Hall)', name)
                    if match:
                        for name in match.groups():
                            bill.add_sponsorship(name=name,
                                                 classification='cosponsor',
                                                 entity_type='person',
                                                 primary=False)
                    else:
                        bill.add_sponsorship(name=name,
                                             classification='cosponsor',
                                             entity_type='person',
                                             primary=False)

        for link in page.xpath("//a[contains(@href, 'votes/house')]"):
            yield from self.scrape_house_vote(bill, link.attrib['href'])

        for tr in reversed(
                page.xpath("//table[@class='tabborder']/descendant::tr")[1:]):
            tds = tr.xpath('td')
            if len(tds) < 3:
                continue

            chamber_letter = tds[0].text_content()
            chamber = {'S': 'upper', 'H': 'lower'}[chamber_letter]

            # Index of date info no longer varies on resolutions.
            date = tds[2].text_content().strip()
            date = datetime.datetime.strptime(date, "%m/%d/%y").date()

            action = tds[1].text_content().strip()
            if action.lower().startswith('passed senate'):
                for href in tds[1].xpath('a/@href'):
                    yield from self.scrape_senate_vote(bill, href, date)

            attrs = dict(chamber=chamber,
                         description=action,
                         date=date.strftime("%Y-%m-%d"))
            temp = self.categorizer.categorize(action)
            related_entities = []
            for key, values in temp.items():
                if key != 'classification':
                    for value in values:
                        related_entities.append({"type": key, "name": value})
            attrs.update(classification=temp['classification'],
                         related_entities=related_entities)
            bill.add_action(**attrs)

        yield bill
Beispiel #41
0
    def _scrape_bill(self, session, bill_data):
        details = self._parse_bill_details(bill_data)

        (senate_url, assembly_url, bill_chamber, bill_type, bill_id,
         title, (prefix, number, active_version)) = details

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=bill_chamber,
            title=title or bill_data['summary'],
            classification=bill_type,
        )

        if bill_data['summary']:
            bill.add_abstract(bill_data['summary'], note='')

        bill_active_version = bill_data['amendments']['items'][active_version]

        # Parse sponsors.
        if bill_data['sponsor'] is not None:
            if bill_data['sponsor']['rules'] is True:
                bill.add_sponsorship(
                    'Rules Committee',
                    entity_type='organization',
                    classification='primary',
                    primary=True,
                )
            elif not bill_data['sponsor']['budget']:
                primary_sponsor = bill_data['sponsor']['member']
                bill.add_sponsorship(
                    primary_sponsor['shortName'],
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

                # There *shouldn't* be cosponsors if there is no sponsor.
                cosponsors = bill_active_version['coSponsors']['items']
                for cosponsor in cosponsors:
                    bill.add_sponsorship(
                        cosponsor['shortName'],
                        entity_type='person',
                        classification='cosponsor',
                        primary=False,
                    )

        # List companion bill.
        same_as = bill_active_version.get('sameAs', {})
        # Check whether "sameAs" property is populated with at least one bill.
        if same_as['items']:
            # Get companion bill ID.
            companion_bill_id = same_as['items'][0]['basePrintNo']

            # Build companion bill session.
            start_year = same_as['items'][0]['session']
            end_year = start_year + 1
            companion_bill_session = '-'.join([str(start_year), str(end_year)])

            # Attach companion bill data.
            bill.add_related_bill(
                companion_bill_id,
                companion_bill_session,
                relation_type='companion',
            )

        # Parse actions.
        chamber_map = {
            'senate': 'upper',
            'assembly': 'lower',
        }

        for action in bill_data['actions']['items']:
            chamber = chamber_map[action['chamber'].lower()]
            action_datetime = datetime.datetime.strptime(action['date'], '%Y-%m-%d')
            action_date = action_datetime.date()
            types, _ = NYBillScraper.categorizer.categorize(action['text'])

            bill.add_action(
                action['text'],
                action_date.strftime('%Y-%m-%d'),
                chamber=chamber,
                classification=types,
            )

        # Handling of sources follows. Sources serving either chamber
        # maintain duplicate data, so we can see certain bill data
        # through either chamber's resources. However, we have to refer
        # to a specific chamber's resources if we want to grab certain
        # specific information such as vote data.
        #
        # As such, I'm placing all potential sources in the interest of
        # thoroughness. - Andy Lo

        # List Open Legislation API endpoint as a source.
        api_url = self.api_client.root + self.api_client.resources['bill'].format(
            session_year=session,
            bill_id=bill_id,
            summary='',
            detail='')
        bill.add_source(api_url)
        bill.add_source(senate_url)
        bill.add_source(assembly_url)

        # Chamber-specific processing.
        if bill_chamber == 'upper':
            # Collect votes.
            for vote_data in bill_data['votes']['items']:
                yield self._parse_senate_votes(vote_data, bill, api_url)
        elif bill_chamber == 'lower':
            assembly = AssemblyBillPage(self, session, bill, details)
            assembly.build()

        # A little strange the way it works out, but the Assembly
        # provides the HTML version documents and the Senate provides
        # the PDF version documents.
        amendments = bill_data['amendments']['items']
        for key, amendment in amendments.items():
            version = amendment['printNo']

            html_version = version + ' HTML'
            html_url = 'http://assembly.state.ny.us/leg/?sh=printbill&bn='\
                '{}&term={}'.format(bill_id, self.term_start_year)
            bill.add_version_link(
                html_version,
                html_url,
                on_duplicate='ignore',
                media_type='text/html',
            )

            pdf_version = version + ' PDF'
            pdf_url = 'http://legislation.nysenate.gov/pdf/bills/{}/{}'\
                .format(self.term_start_year, bill_id)
            bill.add_version_link(
                pdf_version,
                pdf_url,
                on_duplicate='ignore',
                media_type='application/pdf',
            )

        yield bill
Beispiel #42
0
    def scrape(self):
        unreachable_urls = []

        for leg_summary in self.legislation(created_after=datetime.datetime(2015, 5, 17)) :
            title = leg_summary['Title'].strip()

            if not title or not leg_summary['Intro\xa0Date'] :
                continue
                # https://chicago.legistar.com/LegislationDetail.aspx?ID=1800754&GUID=29575A7A-5489-4D8B-8347-4FC91808B201&Options=Advanced&Search=
                # doesn't have an intro date

            bill_type = BILL_TYPES[leg_summary['Type']]

            bill_session = self.session(self.toTime(leg_summary['Intro\xa0Date']))
            bill = Bill(identifier=leg_summary['Record #'],
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name":"Chicago City Council"})

            bill.add_source(leg_summary['url'])

            try :
                leg_details = self.legDetails(leg_summary['url'])
            except IndexError :
                unreachable_urls.append(leg_summary['url'])
                yield bill
                continue

            for related_bill in leg_details.get('Related files', []) :
                lower_title = title.lower()
                if "sundry" in title or "miscellaneous" in title: #these are ominbus
                    bill.add_related_bill(identifier = related_bill['label'],
                                          legislative_session = bill.legislative_session,
                                          relation_type='replaces')
                #for now we're skipping related bills if they
                #don't contain words that make us think they're
                #in a ominbus relationship with each other
                
            for i, sponsor in enumerate(leg_details.get('Sponsors', [])) :
                if i == 0 :
                    primary = True
                    sponsorship_type = "Primary"
                else :
                    primary = False
                    sponsorship_type = "Regular"

                sponsor_name = sponsor['label']

                # Does the Mayor/Clerk introduce legisislation as
                # individuals role holders or as the OFfice of City
                # Clerk and the Office of the Mayor?
                entity_type = 'person'
                if sponsor_name.startswith(('City Clerk', 
                                            'Mendoza, Susana')) :
                    sponsor_name = 'Office of the City Clerk'
                    entity_type = 'organization'
                elif sponsor_name.startswith(('Emanuel, Rahm',)) :
                    sponsor_name = 'Office of the Mayor'
                    entity_type = 'organization'
                if not sponsor_name.startswith(('Misc. Transmittal',
                                                'No Sponsor',
                                                'Dept./Agency')) :
                    bill.add_sponsorship(sponsor_name, 
                                         sponsorship_type,
                                         entity_type,
                                         primary,
                                         entity_id = _make_pseudo_id(name=sponsor_name))

            if 'Topic' in leg_details :
                for subject in leg_details[u'Topic'].split(',') :
                    bill.add_subject(subject)

            for attachment in leg_details.get('Attachments', []) :
                if attachment['label'] :
                    bill.add_version_link(attachment['label'],
                                          attachment['url'],
                                          media_type="application/pdf")

            for action in self.history(leg_summary['url']) :
                action_description = action['Action']
                try :
                    action_date =  self.toTime(action['Date']).date().isoformat()
                except AttributeError : # https://chicago.legistar.com/LegislationDetail.aspx?ID=1424866&GUID=CEC53337-B991-4268-AE8A-D4D174F8D492
                    continue

                if action_description :
                    try :
                        responsible_org = action['Action\xa0By']['label']
                    except TypeError  :
                        responsible_org = action['Action\xa0By']
                    if responsible_org == 'City Council' :
                        responsible_org = 'Chicago City Council'

                    act = bill.add_action(action_description,
                                          action_date,
                                          organization={'name': responsible_org},
                                          classification=ACTION_CLASSIFICATION[action_description])

                    if action_description == 'Referred' :
                        try :
                            leg_details['Current Controlling Legislative Body']['label']
                            controlling_bodies = [leg_details['Current Controlling Legislative Body']]
                        except TypeError :
                            controlling_bodies = leg_details['Current Controlling Legislative Body']
                        if controlling_bodies :
                            for controlling_body in controlling_bodies :
                                body_name = controlling_body['label']
                                if body_name.startswith("Joint Committee") :
                                    act.add_related_entity(body_name,
                                                           'organization')
                                else :
                                    act.add_related_entity(body_name,
                                                           'organization',
                                                           entity_id = _make_pseudo_id(name=body_name))


                    if 'url' in action['Action\xa0Details'] :
                        action_detail_url = action['Action\xa0Details']['url']
                        result, votes = self.extractVotes(action_detail_url)

                        if votes and result : # see https://github.com/datamade/municipal-scrapers-us/issues/15
                            action_vote = VoteEvent(legislative_session=bill.legislative_session, 
                                               motion_text=action_description,
                                               organization={'name': responsible_org},
                                               classification=None,
                                               start_date=action_date,
                                               result=result,
                                               bill=bill)
                            action_vote.add_source(action_detail_url)

                            for option, voter in votes :
                                action_vote.vote(option, voter)

                            yield action_vote

            bill.extras = {'local_classification' : leg_summary['Type']}
                            
            yield bill
        print(unreachable_urls)
Beispiel #43
0
    def scrape_bill(self, bill_num, session):
        chamber_map = {'House': 'lower', 'Senate': 'upper', 'LSO': 'executive'}
        # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
        bill_json_url = 'http://wyoleg.gov/LsoService/api/BillInformation/{}/' \
                        '{}?calendarDate='.format(
                            session, bill_num)
        response = self.get(bill_json_url)
        bill_json = json.loads(response.content.decode('utf-8'))

        chamber = 'lower' if bill_json['bill'][0] else 'upper'

        bill = Bill(identifier=bill_json['bill'],
                    legislative_session=session,
                    title=bill_json['catchTitle'],
                    chamber=chamber,
                    classification="bill",
                    )

        bill.add_title(bill_json['billTitle'])

        source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(session,
                                                                      bill_json['bill'])
        bill.add_source(source_url)

        for action_json in bill_json['billActions']:
            utc_action_date = self.parse_local_date(action_json['statusDate'])

            actor = None
            if action_json['location'] and action_json['location'] in chamber_map:
                actor = chamber_map[action_json['location']]

            action = bill.add_action(
                chamber=actor,
                description=action_json['statusMessage'],
                date=utc_action_date,
                classification=categorize_action(action_json['statusMessage']),
            )

            action.extras = {
                'billInformationID': action_json['billInformationID']}

        if bill_json['introduced']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['introduced'])

            bill.add_version_link(note="Introduced",
                                  url=url,
                                  media_type="application/pdf"  # optional but useful!
                                  )

        if bill_json['enrolledAct']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['enrolledAct'])

            bill.add_version_link(note="Enrolled",
                                  url=url,
                                  media_type="application/pdf"  # optional but useful!
                                  )

        if bill_json['fiscalNote']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['fiscalNote'])

            bill.add_document_link(note="Fiscal Note",
                                   url=url,
                                   media_type="application/pdf"  # optional but useful!
                                   )

        if bill_json['digest']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['digest'])

            bill.add_document_link(note="Bill Digest",
                                   url=url,
                                   media_type="application/pdf"  # optional but useful!
                                   )

        if bill_json['vetoes']:
            for veto in bill_json['vetoes']:
                url = 'http://wyoleg.gov/{}'.format(veto['vetoLinkPath'])
                bill.add_version_link(note=veto['vetoLinkText'],
                                      url=url,
                                      media_type="application/pdf"  # optional but useful!
                                      )

        for amendment in bill_json['amendments']:
            # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
            url = 'http://wyoleg.gov/{}/Amends/{}.pdf'.format(
                session, amendment['amendmentNumber'])

            if amendment['sponsor'] and amendment['status']:
                title = 'Amendment {} ({}) - {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                    amendment['sponsor'],
                    amendment['status'],
                )
            else:
                title = 'Amendment {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                )
            # add versions of the bill text
            version = bill.add_version_link(
                note=title,
                url=url,
                media_type="application/pdf",
            )
            version['extras'] = {
                'amendmentNumber': amendment['amendmentNumber'],
                'sponsor': amendment['sponsor'],
            }

        for sponsor in bill_json['sponsors']:
            status = 'primary' if sponsor['primarySponsor'] else 'cosponsor'
            sponsor_type = 'person' if sponsor['sponsorTitle'] else 'organization'
            bill.add_sponsorship(
                name=sponsor['name'],
                classification=status,
                entity_type=sponsor_type,
                primary=sponsor['primarySponsor']
            )

        if bill_json['summary']:
            bill.add_abstract(
                note="summary",
                abstract=bill_json['summary'],
            )

        if bill_json['enrolledNumber']:
            bill.extras['wy_enrolled_number'] = bill_json['enrolledNumber']

        if bill_json['chapter']:
            bill.extras['chapter'] = bill_json['chapter']

        if bill_json['effectiveDate']:
            eff = datetime.datetime.strptime(
                bill_json['effectiveDate'], '%m/%d/%Y')
            bill.extras['effective_date'] = eff.strftime('%Y-%m-%d')

        bill.extras['wy_bill_id'] = bill_json['id']

        for vote_json in bill_json['rollCalls']:
            yield from self.scrape_vote(bill, vote_json, session)

        yield bill
Beispiel #44
0
    def scrape_bill(self, bill_num, session):
        chamber_map = {'House': 'lower', 'Senate': 'upper', 'LSO': 'executive'}
        # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
        bill_json_url = 'http://wyoleg.gov/LsoService/api/BillInformation/{}/' \
                        '{}?calendarDate='.format(
                            session, bill_num)
        response = self.get(bill_json_url)
        bill_json = json.loads(response.content.decode('utf-8'))

        chamber = 'lower' if bill_json['bill'][0] else 'upper'

        bill = Bill(
            identifier=bill_json['bill'],
            legislative_session=session,
            title=bill_json['catchTitle'],
            chamber=chamber,
            classification="bill",
        )

        bill.add_title(bill_json['billTitle'])

        source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(
            session, bill_json['bill'])
        bill.add_source(source_url)

        for action_json in bill_json['billActions']:
            utc_action_date = self.parse_local_date(action_json['statusDate'])

            actor = None
            if action_json['location'] and action_json[
                    'location'] in chamber_map:
                actor = chamber_map[action_json['location']]

            action = bill.add_action(
                chamber=actor,
                description=action_json['statusMessage'],
                date=utc_action_date,
                classification=categorize_action(action_json['statusMessage']),
            )

            action.extras = {
                'billInformationID': action_json['billInformationID']
            }

        if bill_json['introduced']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['introduced'])

            bill.add_version_link(
                note="Introduced",
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

        if bill_json['enrolledAct']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['enrolledAct'])

            bill.add_version_link(
                note="Enrolled",
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

        if bill_json['fiscalNote']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['fiscalNote'])

            bill.add_document_link(
                note="Fiscal Note",
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

        if bill_json['digest']:
            url = 'http://wyoleg.gov/{}'.format(bill_json['digest'])

            bill.add_document_link(
                note="Bill Digest",
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

        if bill_json['vetoes']:
            for veto in bill_json['vetoes']:
                url = 'http://wyoleg.gov/{}'.format(veto['vetoLinkPath'])
                bill.add_version_link(
                    note=veto['vetoLinkText'],
                    url=url,
                    media_type="application/pdf"  # optional but useful!
                )

        for amendment in bill_json['amendments']:
            # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
            url = 'http://wyoleg.gov/{}/Amends/{}.pdf'.format(
                session, amendment['amendmentNumber'])

            if amendment['sponsor'] and amendment['status']:
                title = 'Amendment {} ({}) - {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                    amendment['sponsor'],
                    amendment['status'],
                )
            else:
                title = 'Amendment {} ({})'.format(
                    amendment['amendmentNumber'],
                    amendment['order'],
                )
            # add versions of the bill text
            version = bill.add_version_link(
                note=title,
                url=url,
                media_type="application/pdf",
            )
            version['extras'] = {
                'amendmentNumber': amendment['amendmentNumber'],
                'sponsor': amendment['sponsor'],
            }

        for sponsor in bill_json['sponsors']:
            status = 'primary' if sponsor['primarySponsor'] else 'cosponsor'
            sponsor_type = 'person' if sponsor[
                'sponsorTitle'] else 'organization'
            bill.add_sponsorship(name=sponsor['name'],
                                 classification=status,
                                 entity_type=sponsor_type,
                                 primary=sponsor['primarySponsor'])

        if bill_json['summary']:
            bill.add_abstract(
                note="summary",
                abstract=bill_json['summary'],
            )

        if bill_json['enrolledNumber']:
            bill.extras['wy_enrolled_number'] = bill_json['enrolledNumber']

        if bill_json['chapter']:
            bill.extras['chapter'] = bill_json['chapter']

        if bill_json['effectiveDate']:
            eff = datetime.datetime.strptime(bill_json['effectiveDate'],
                                             '%m/%d/%Y')
            bill.extras['effective_date'] = eff.strftime('%Y-%m-%d')

        bill.extras['wy_bill_id'] = bill_json['id']

        for vote_json in bill_json['rollCalls']:
            yield from self.scrape_vote(bill, vote_json, session)

        yield bill
Beispiel #45
0
    def scrape_bill(self, chamber, session, bill_id, bill_type, url):
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        title = doc.xpath('//b[text()="TITLE:"]')
        if title:
            title = title[0].tail.strip().strip('"')
        else:
            self.warning("skipping bill %s, no information" % url)
            return

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            classification=bill_type,
            legislative_session=session,
        )
        bill.add_source(url)

        # Get sponsors
        spons_str = doc.xpath('//b[contains(text(), "SPONSOR")]')[0].tail.strip()
        sponsors_match = re.match(
            '(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
            spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(',')
            sponsor = sponsors[0].strip()

            if sponsor:
                bill.add_sponsorship(
                    sponsors[0],
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

            for sponsor in sponsors[1:]:
                sponsor = sponsor.strip()
                if sponsor:
                    bill.add_sponsorship(
                        sponsor,
                        entity_type='person',
                        classification='cosponsor',
                        primary=False,
                    )
        else:
            # Committee sponsorship
            spons_str = spons_str.strip()

            if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str):
                spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$',
                                   '', spons_str).title()
                spons_str = (spons_str +
                             " Committee (by request of the governor)")

            if spons_str:
                bill.add_sponsorship(
                    spons_str,
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )

        # Get actions from second myth table
        self._current_comm = None
        act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:]
        for row in act_rows:
            date, journal, raw_chamber, action = row.xpath('td')

            act_date = datetime.datetime.strptime(date.text_content().strip(),
                                                  '%m/%d/%y')
            raw_chamber = raw_chamber.text_content().strip()
            action = action.text_content().strip()

            if raw_chamber == "(H)":
                act_chamber = "lower"
            elif raw_chamber == "(S)":
                act_chamber = "upper"

            if re.match("\w+ Y(\d+)", action):
                vote_href = journal.xpath('.//a/@href')
                if vote_href:
                    yield from self.parse_vote(bill, action, act_chamber, act_date,
                                               vote_href[0])

            action, atype = self.clean_action(action)

            match = re.match('^Prefile released (\d+/\d+/\d+)$', action)
            if match:
                action = 'Prefile released'
                act_date = datetime.datetime.strptime(match.group(1), '%m/%d/%y')

            bill.add_action(
                action, chamber=act_chamber, date=act_date.strftime('%Y-%m-%d'),
                classification=atype)

        # Get subjects
        for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
            bill.add_subject(subj.strip())

        # Get versions
        text_list_url = (
            "http://www.legis.state.ak.us/"
            "basis/get_fulltext.asp?session=%s&bill=%s"
        ) % (session, bill_id)
        bill.add_source(text_list_url)

        text_doc = lxml.html.fromstring(self.get(text_list_url).text)
        text_doc.make_links_absolute(text_list_url)
        for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'):
            name = link.xpath('../preceding-sibling::td/text()')[0].strip()
            text_url = link.get('href')
            bill.add_version_link(name, text_url, media_type="text/html")

        # Get documents
        doc_list_url = (
            "http://www.legis.state.ak.us/"
            "basis/get_documents.asp?session=%s&bill=%s"
        ) % (session, bill_id)
        doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
        doc_list.make_links_absolute(doc_list_url)
        bill.add_source(doc_list_url)
        for href in doc_list.xpath('//a[contains(@href, "get_documents")][@onclick]'):
            h_name = href.text_content()
            h_href = href.attrib['href']
            if h_name.strip():
                bill.add_document_link(h_name, h_href)

        yield bill
Beispiel #46
0
    def scrape_bill(self, chamber, session, session_id, bill_id, url):
        sidebar = lxml.html.fromstring(self.get(url).text)
        sidebar.make_links_absolute("https://www.legis.iowa.gov")

        hist_url = (
            f"https://www.legis.iowa.gov/legislation/billTracking/"
            f"billHistory?billName={bill_id}&ga={session_id}"
        )
        req_session = requests.Session()
        req = requests.get(hist_url)
        if req.status_code == 500:
            self.warning("500 error on {}, skipping".format(hist_url))
            return

        page = lxml.html.fromstring(req.text)
        page.make_links_absolute("https://www.legis.iowa.gov")

        title = page.xpath(
            'string(//div[@id="content"]/div[@class=' '"divideVert"]/div/div[4]/div[2])'
        ).strip()

        if title == "":
            # Sometimes the title is moved, see
            # https://www.legis.iowa.gov/legislation/billTracking/billHistory?billName=SF%20139&ga=88
            title = page.xpath(
                'string(//div[@id="content"]/div[@class=' '"divideVert"]/div[4]/div[2])'
            ).strip()
            if title == "":
                self.warning("URL: %s gives us an *EMPTY* bill. Aborting." % url)
                return

        if title.lower().startswith("in"):
            title = page.xpath("string(//table[2]/tr[3])").strip()

        if "HR" in bill_id or "SR" in bill_id:
            bill_type = ["resolution"]
        elif "HJR" in bill_id or "SJR" in bill_id:
            bill_type = ["joint resolution"]
        elif "HCR" in bill_id or "SCR" in bill_id:
            bill_type = ["concurrent resolution"]
        else:
            bill_type = ["bill"]

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=chamber,
            title=title,
            classification=bill_type,
        )

        bill.add_source(hist_url)

        # base url for text version (version_abbrev, session_id, bill_id)
        version_html_url_template = (
            "https://www.legis.iowa.gov/docs/"
            "publications/LG{}/{}/attachments/{}.html"
        )
        version_pdf_url_template = (
            "https://www.legis.iowa.gov/docs/" "publications/LG{}/{}/{}.pdf"
        )

        # get pieces of version_link
        vpieces = sidebar.xpath('//select[@id="billVersions"]/option')
        if vpieces:
            for version in vpieces:
                version_name = version.text
                version_abbrev = version.xpath("string(@value)")

                # Get HTML document of bill version.
                version_html_url = version_html_url_template.format(
                    version_abbrev.upper(), session_id, bill_id.replace(" ", "")
                )

                bill.add_version_link(
                    note=version_name, url=version_html_url, media_type="text/html"
                )

                # Get PDF document of bill version.
                version_pdf_url = version_pdf_url_template.format(
                    version_abbrev.upper(), session_id, bill_id.replace(" ", "")
                )

                if "Marked Up" in version_name:
                    version_pdf_url = sidebar.xpath(
                        "//iframe[@id='bbContextDoc']/@src"
                    )[0]

                bill.add_version_link(
                    note=version_name, url=version_pdf_url, media_type="application/pdf"
                )

        sponsors_str = page.xpath(
            'string(//div[@id="content"]/div[@class=' '"divideVert"]/div/div[4]/div[1])'
        ).strip()

        if re.search("^By ", sponsors_str):
            sponsors = re.split(",| and ", sponsors_str.split("By ")[1])
        # for some bills sponsors listed in different format
        else:
            sponsors = re.findall(
                r"[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)", sponsors_str
            )

        for sponsor in sponsors:
            sponsor = sponsor.replace(" and", "").strip(" .,")

            # a few sponsors get mangled by our regex
            sponsor = {
                "Means": "Ways & Means",
                "Iowa": "Economic Growth/Rebuild Iowa",
                "Safety": "Public Safety",
                "Resources": "Human Resources",
                "Affairs": "Veterans Affairs",
                "Protection": "Environmental Protection",
                "Government": "State Government",
                "Boef": "De Boef",
            }.get(sponsor, sponsor)

            if sponsor[0].islower():
                # SSBs catch cruft in it ('charges', 'overpayments')
                # https://sunlight.atlassian.net/browse/DATA-286
                continue

            bill.add_sponsorship(
                name=sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        for tr in page.xpath(
            "//table[contains(@class, 'billActionTable')][1]/tbody/tr"
        ):
            date = tr.xpath("string(td[contains(text(), ', 20')])").strip()
            if date.startswith("***"):
                continue
            elif "No history is recorded at this time." in date:
                return
            if date == "":
                continue

            date = datetime.datetime.strptime(date, "%B %d, %Y").date()

            action = tr.xpath("string(td[3])").strip()
            action = re.sub(r"\s+", " ", action)

            # Capture any amendment links.
            links = [link for link in [version["links"] for version in bill.versions]]
            version_urls = [link["url"] for link in [i for sub in links for i in sub]]
            if "amendment" in action.lower():
                for anchor in tr.xpath(".//a[1]"):
                    if "-" in anchor.text:
                        # https://www.legis.iowa.gov/docs/publications/AMDI/88/S3071.pdf
                        amd_pattern = "https://www.legis.iowa.gov/docs/publications/AMDI/{}/{}.pdf"
                        amd_id = anchor.text.replace("-", "").strip()
                        amd_url = amd_pattern.format(session_id, amd_id)
                        amd_name = "Amendment {}".format(anchor.text.strip())

                        if amd_url not in version_urls:
                            bill.add_version_link(
                                note=amd_name, url=amd_url, media_type="application/pdf"
                            )
                            version_urls.append(amd_url)
                        else:
                            self.info("Already Added {}, skipping".format(amd_url))

            if "S.J." in action or "SCS" in action:
                actor = "upper"
            elif "H.J." in action or "HCS" in action:
                actor = "lower"
            else:
                actor = "legislature"

            action = re.sub(r"(H|S)\.J\.\s+\d+\.$", "", action).strip()

            if action.startswith("Introduced"):
                atype = ["introduction"]
                if ", referred to" in action:
                    atype.append("referral-committee")
            elif action.startswith("Read first time"):
                atype = "reading-1"
            elif action.startswith("Referred to"):
                atype = "referral-committee"
            elif action.startswith("Sent to Governor"):
                atype = "executive-receipt"
            elif action.startswith("Reported Signed by Governor"):
                atype = "executive-signature"
            elif action.startswith("Signed by Governor"):
                atype = "executive-signature"
            elif action.startswith("Vetoed by Governor"):
                atype = "executive-veto"
            elif action.startswith("Item veto"):
                atype = "executive-veto-line-item"
            elif re.match(r"Passed (House|Senate)", action):
                atype = "passage"
            elif re.match(r"Amendment (S|H)-\d+ filed", action):
                atype = ["amendment-introduction"]
                if ", adopted" in action:
                    atype.append("amendment-passage")
            elif re.match(r"Amendment (S|H)-\d+( as amended,)? adopted", action):
                atype = "amendment-passage"
            elif re.match(r"Amendment (S|N)-\d+ lost", action):
                atype = "amendment-failure"
            elif action.startswith("Resolution filed"):
                atype = "introduction"
            elif action.startswith("Resolution adopted"):
                atype = "passage"
            elif action.startswith("Committee report") and action.endswith("passage."):
                atype = "committee-passage"
            elif action.startswith("Withdrawn"):
                atype = "withdrawal"
            else:
                atype = None

            if action.strip() == "":
                continue

            if re.search(r"END OF \d+ ACTIONS", action):
                continue

            if "$history" not in action:
                bill.add_action(
                    description=action, date=date, chamber=actor, classification=atype
                )

        self.scrape_subjects(bill, bill_id, session, req_session)

        yield bill
Beispiel #47
0
    def scrape(self, window=28, matter_ids=None):
        '''By default, scrape board reports updated in the last 28 days.
        Optionally specify a larger or smaller window of time from which to
        scrape updates, or specific matters to scrape.
        Note that passing a value for :matter_ids supercedes the value of
        :window, such that the given matters will be scraped regardless of
        when they were updated.

        Optional parameters
        :window (numeric) - Amount of time for which to scrape updates, e.g.
        a window of 7 will scrape legislation updated in the last week. Pass
        a window of 0 to scrape all legislation.
        :matter_ids (str) - Comma-separated list of matter IDs to scrape
        '''

        if matter_ids:
            matters = [
                self.matter(matter_id) for matter_id in matter_ids.split(',')
            ]
            matters = filter(
                None, matters)  # Skip matters that are not yet in Legistar
        elif float(window):  # Support for partial days, i.e., window=0.15
            n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
                float(window))
            matters = self.matters(n_days_ago)
        else:
            # Scrape all matters, including those without a last-modified date
            matters = self.matters()

        n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(
            float(window))
        for matter in matters:
            # Skip this bill, until Metro cleans up duplicate in Legistar API
            if matter['MatterFile'] == '2017-0447':
                continue

            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            # Do not scrape private bills introduced before this timestamp.
            if self._is_restricted(matter) and (
                    date < self.START_DATE_PRIVATE_SCRAPE):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            # The Metro scraper scrapes private bills.
            # However, we do not want to capture significant data about private bills,
            # other than the value of the helper function `_is_restricted` and a last modified timestamp.
            # We yield private bills early, wipe data from previously imported once-public bills,
            # and include only data *required* by the pupa schema.
            # https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
            bill.extras = {'restrict_view': self._is_restricted(matter)}

            # Add API source early.
            # Private bills should have this url for debugging.
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
            bill.add_source(legistar_api, note='api')

            if self._is_restricted(matter):
                # required fields
                bill.title = 'Restricted View'

                # wipe old data
                bill.extras['plain_text'] = ''
                bill.extras['rtf_text'] = ''
                bill.sponsorships = []
                bill.related_bills = []
                bill.versions = []
                bill.documents = []
                bill.actions = []

                yield bill
                continue

            legistar_web = matter['legistar_url']
            bill.add_source(legistar_web, note='web')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        try:
                            raw_option = vote['VoteValueName'].lower()
                        except AttributeError:
                            raw_option = None
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            for relation in self.relations(matter_id):
                try:
                    # Get data (i.e., json) for the related bill.
                    # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session).
                    # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue.
                    related_bill = self.endpoint(
                        '/matters/{0}', relation['MatterRelationMatterId'])
                except scrapelib.HTTPError:
                    continue
                else:
                    date = related_bill['MatterIntroDate']
                    related_bill_session = self.session(self.toTime(date))
                    identifier = related_bill['MatterFile']
                    bill.add_related_bill(
                        identifier=identifier,
                        legislative_session=related_bill_session,
                        relation_type='companion')
                    # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                    # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'.

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'].strip(),
                        media_type="application/pdf")

            bill.extras['local_classification'] = matter['MatterTypeName']

            matter_version_value = matter['MatterVersion']
            text = self.text(matter_id, matter_version_value)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Beispiel #48
0
    def scrape_bill_type(self,
                         chamber,
                         session,
                         bill_type,
                         type_abbr,
                         committee_abbr_regex=get_committee_name_regex()):
        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_id, session, title='', chamber=chamber)
            if ((bill_id.startswith('S') and chamber == 'lower')
                    or (bill_id.startswith('A') and chamber == 'upper')):
                print("!!!! BAD ID/CHAMBER PAIR !!!!", bill)
                continue

            # # Construct session for web query, going from '20092010' to '0910'
            # source_session = session[2:4] + session[6:8]

            # # Turn 'AB 10' into 'ab_10'
            # source_num = "%s_%s" % (bill.measure_type.lower(),
            #                         bill.measure_num)

            # Construct a fake source url
            source_url = ('http://leginfo.legislature.ca.gov/faces/'
                          'billNavClient.xhtml?bill_id=%s') % bill.bill_id

            fsbill.add_source(source_url)
            fsbill.add_version_link(bill_id,
                                    source_url,
                                    media_type='text/html')

            title = ''
            type_ = ['bill']
            subject = ''
            all_titles = set()

            # Get digest test (aka "summary") from latest version.
            if bill.versions:
                version = bill.versions[-1]
                nsmap = version.xml.nsmap
                xpath = '//caml:DigestText/xhtml:p'
                els = version.xml.xpath(xpath, namespaces=nsmap)
                chunks = []
                for el in els:
                    t = etree_text_content(el)
                    t = re.sub(r'\s+', ' ', t)
                    t = re.sub(r'\)(\S)', lambda m: ') %s' % m.group(1), t)
                    chunks.append(t)
                summary = '\n\n'.join(chunks)

            for version in bill.versions:
                if not version.bill_xml:
                    continue

                version_date = self._tz.localize(
                    version.bill_version_action_date)

                # create a version name to match the state's format
                # 02/06/17 - Enrolled
                version_date_human = version_date.strftime('%m/%d/%y')
                version_name = "{} - {}".format(version_date_human,
                                                version.bill_version_action)

                version_base = "https://leginfo.legislature.ca.gov/faces"

                version_url_pdf = "{}/billPdf.xhtml?bill_id={}&version={}".format(
                    version_base, version.bill_id, version.bill_version_id)

                fsbill.add_version_link(version_name,
                                        version_url_pdf,
                                        media_type='application/pdf',
                                        date=version_date.date())

                # CA is inconsistent in that some bills have a short title
                # that is longer, more descriptive than title.
                if bill.measure_type in ('AB', 'SB'):
                    impact_clause = clean_title(version.title)
                    title = clean_title(version.short_title)
                else:
                    impact_clause = None
                    if len(version.title) < len(version.short_title) and \
                            not version.title.lower().startswith('an act'):
                        title = clean_title(version.short_title)
                    else:
                        title = clean_title(version.title)

                if title:
                    all_titles.add(title)

                type_ = [bill_type]

                if version.appropriation == 'Yes':
                    type_.append('appropriation')

                tags = []
                if version.fiscal_committee == 'Yes':
                    tags.append('fiscal committee')
                if version.local_program == 'Yes':
                    tags.append('local program')
                if version.urgency == 'Yes':
                    tags.append('urgency')
                if version.taxlevy == 'Yes':
                    tags.append('tax levy')

                if version.subject:
                    subject = clean_title(version.subject)

            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill.title = title
            if summary:
                fsbill.add_abstract(summary, note='summary')
            fsbill.classification = type_
            fsbill.subject = [subject] if subject else []
            fsbill.extras['impact_clause'] = impact_clause
            fsbill.extras['tags'] = tags

            # We don't want the current title in alternate_titles
            all_titles.remove(title)

            for title in all_titles:
                fsbill.add_title(title)

            for author in version.authors:
                fsbill.add_sponsorship(
                    author.name,
                    classification=SPONSOR_TYPES[author.contribution],
                    primary=author.primary_author_flg == 'Y',
                    entity_type='person',
                )
                # fsbill.sponsorships[-1]['extras'] = {'official_type': author.contribution}

            seen_actions = set()
            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {
                        'Assembly': 'lower',
                        'Senate': 'upper'
                    }[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:

                    def replacer(matchobj):
                        if matchobj:
                            return {
                                'Assembly': 'lower',
                                'Senate': 'upper'
                            }[matchobj.group()]
                        else:
                            return matchobj.group()

                    actor = re.sub(r'^(Assembly|Senate)', replacer, actor)

                type_ = []

                act_str = action.action
                act_str = re.sub(r'\s+', ' ', act_str)

                attrs = self.categorizer.categorize(act_str)

                # Add in the committee strings of the related committees, if any.
                kwargs = attrs
                matched_abbrs = committee_abbr_regex.findall(action.action)

                if re.search(r'Com[s]?. on',
                             action.action) and not matched_abbrs:
                    msg = 'Failed to extract committee abbr from %r.'
                    self.logger.warning(msg % action.action)

                if matched_abbrs:
                    committees = []
                    for abbr in matched_abbrs:
                        try:
                            name = self.committee_abbr_to_name(chamber, abbr)
                            committees.append(name)
                        except KeyError:
                            msg = ('Mapping contains no committee name for '
                                   'abbreviation %r. Action text was %r.')
                            args = (abbr, action.action)
                            raise KeyError(msg % args)

                    committees = filter(None, committees)
                    kwargs['committees'] = committees

                    code = re.search(r'C[SXZ]\d+', actor)
                    if code is not None:
                        code = code.group()
                        kwargs['actor_info'] = {'committee_code': code}

                    assert len(list(committees)) == len(matched_abbrs)
                    for committee, abbr in zip(committees, matched_abbrs):
                        act_str = act_str.replace('Coms. on ', '')
                        act_str = act_str.replace('Com. on ' + abbr, committee)
                        act_str = act_str.replace(abbr, committee)
                        if not act_str.endswith('.'):
                            act_str = act_str + '.'

                # Determine which chamber the action originated from.
                changed = False
                for committee_chamber in ['upper', 'lower', 'legislature']:
                    if actor.startswith(committee_chamber):
                        actor = committee_chamber
                        changed = True
                        break
                if not changed:
                    actor = 'legislature'

                if actor != action.actor:
                    actor_info = kwargs.get('actor_info', {})
                    actor_info['details'] = action.actor
                    kwargs['actor_info'] = actor_info

                # Add strings for related legislators, if any.
                rgx = r'(?:senator|assembly[mwp][^ .,:;]+)\s+[^ .,:;]+'
                legislators = re.findall(rgx, action.action, re.I)
                if legislators:
                    kwargs['legislators'] = legislators

                date = action.action_date
                date = self._tz.localize(date)
                date = date.date()
                if (actor, act_str, date) in seen_actions:
                    continue

                kwargs.update(self.categorizer.categorize(act_str))

                action = fsbill.add_action(
                    act_str,
                    date.strftime('%Y-%m-%d'),
                    chamber=actor,
                    classification=kwargs['classification'])
                for committee in kwargs.get('committees', []):
                    action.add_related_entity(committee,
                                              entity_type='organization')
                seen_actions.add((actor, act_str, date))

            for vote_num, vote in enumerate(bill.votes):
                if vote.vote_result == '(PASS)':
                    result = True
                else:
                    result = False

                if not vote.location:
                    continue

                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                    # vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                if vote.motion:
                    motion = vote.motion.motion_text or ''
                else:
                    motion = ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ', '',
                                motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$', '',
                                motion)
                motion = re.sub(
                    r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                    r'Urgency Clause$', '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                # XXX this is responsible for all the CA 'committee' votes, not
                # sure if that's a feature or bug, so I'm leaving it as is...
                # vote_classification = chamber if (vote_location == 'Floor') else 'committee'
                # org = {
                # 'name': vote_location,
                # 'classification': vote_classification
                # }

                fsvote = VoteEvent(
                    motion_text=motion,
                    start_date=self._tz.localize(vote.vote_date_time),
                    result='pass' if result else 'fail',
                    classification=vtype,
                    # organization=org,
                    chamber=vote_chamber,
                    bill=fsbill,
                )
                fsvote.extras = {'threshold': vote.threshold}

                source_url = ('http://leginfo.legislature.ca.gov/faces'
                              '/billVotesClient.xhtml?bill_id={}').format(
                                  fsbill.identifier)
                fsvote.add_source(source_url)
                fsvote.pupa_id = source_url + '#' + str(vote_num)

                rc = {'yes': [], 'no': [], 'other': []}
                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        rc['yes'].append(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        rc['no'].append(record.legislator_name)
                    else:
                        rc['other'].append(record.legislator_name)

                # Handle duplicate votes
                for key in rc.keys():
                    rc[key] = list(set(rc[key]))

                for key, voters in rc.items():
                    for voter in voters:
                        fsvote.vote(key, voter)
                    # Set counts by summed votes for accuracy
                    fsvote.set_count(key, len(voters))

                yield fsvote

            yield fsbill
            self.session.expire_all()
Beispiel #49
0
    def scrape_details(self, bill_detail_url, session, chamber, bill_id):
        """
        Create the Bill and add the information obtained from the provided bill_detail_url.
        and then yield the bill object.
        :param bill_detail_url:
        :param session:
        :param chamber:
        :param bill_id:
        :return:
        """
        page = self.get(bill_detail_url).text

        if 'INVALID BILL NUMBER' in page:
            self.warning('INVALID BILL %s' % bill_detail_url)
            return

        doc = lxml.html.fromstring(page)
        doc.make_links_absolute(bill_detail_url)

        bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]

        bill_type = bill_div.xpath('span/text()')[0]

        if 'General Bill' in bill_type:
            bill_type = 'bill'
        elif 'Concurrent Resolution' in bill_type:
            bill_type = 'concurrent resolution'
        elif 'Joint Resolution' in bill_type:
            bill_type = 'joint resolution'
        elif 'Resolution' in bill_type:
            bill_type = 'resolution'
        else:
            raise ValueError('unknown bill type: %s' % bill_type)

        # this is fragile, but less fragile than it was
        b = bill_div.xpath('./b[text()="Summary:"]')[0]
        bill_summary = b.getnext().tail.strip()

        bill = Bill(
            bill_id,
            legislative_session=session,  # session name metadata's `legislative_sessions`
            chamber=chamber,  # 'upper' or 'lower'
            title=bill_summary,
            classification=bill_type
        )

        subjects = list(self._subjects[bill_id])

        for subject in subjects:
            bill.add_subject(subject)

        # sponsors
        for sponsor in doc.xpath('//a[contains(@href, "member.php")]/text()'):
            bill.add_sponsorship(
                name=sponsor,
                classification='primary',
                primary=True,
                entity_type='person'
            )
        for sponsor in doc.xpath('//a[contains(@href, "committee.php")]/text()'):
            sponsor = sponsor.replace(u'\xa0', ' ').strip()
            bill.add_sponsorship(
                name=sponsor,
                classification='primary',
                primary=True,
                entity_type='organization'
            )

        # find versions
        version_url = doc.xpath('//a[text()="View full text"]/@href')[0]
        version_html = self.get(version_url).text
        version_doc = lxml.html.fromstring(version_html)
        version_doc.make_links_absolute(version_url)
        for version in version_doc.xpath('//a[contains(@href, "/prever/")]'):
            # duplicate versions with same date, use first appearance

            bill.add_version_link(
                note=version.text,  # Description of the version from the state;
                                    #  eg, 'As introduced', 'Amended', etc.
                url=version.get('href'),
                on_duplicate='ignore',
                media_type='text/html'  # Still a MIME type
            )

        # actions
        for row in bill_div.xpath('table/tr'):
            date_td, chamber_td, action_td = row.xpath('td')

            date = datetime.datetime.strptime(date_td.text, "%m/%d/%y")
            action_chamber = {'Senate': 'upper',
                              'House': 'lower',
                              None: 'legislature'}[chamber_td.text]

            action = action_td.text_content()
            action = action.split('(House Journal')[0]
            action = action.split('(Senate Journal')[0].strip()

            atype = action_type(action)

            bill.add_action(
                description=action,  # Action description, from the state
                date=date.strftime('%Y-%m-%d'),  # `YYYY-MM-DD` format
                chamber=action_chamber,  # 'upper' or 'lower'
                classification=atype  # Options explained in the next section
            )

        # votes
        vurl = doc.xpath('//a[text()="View Vote History"]/@href')
        if vurl:
            vurl = vurl[0]
            yield from self.scrape_vote_history(bill, vurl)

        bill.add_source(bill_detail_url)
        yield bill
Beispiel #50
0
    def scrape(self):
        three_days_ago = datetime.datetime.now() - datetime.timedelta(3)
        for matter in self.matters(three_days_ago):
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)):
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name": "Board of Directors"})

            legistar_web = self.legislation_detail_url(matter_id)
            legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id):
                act = bill.add_action(**action)

                if action['description'] == 'Referred':
                    body_name = matter['MatterBodyName']
                    act.add_related_entity(
                        body_name,
                        'organization',
                        entity_id=_make_pseudo_id(name=body_name))

                result, votes = vote
                if result:
                    vote_event = VoteEvent(
                        legislative_session=bill.legislative_session,
                        motion_text=action['description'],
                        organization=action['organization'],
                        classification=None,
                        start_date=action['date'],
                        result=result,
                        bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes:
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(
                            raw_option, raw_option)
                        vote_event.vote(clean_option,
                                        vote['VotePersonName'].strip())

                    yield vote_event

            for sponsorship in self.sponsorships(matter_id):
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id):
                bill.add_subject(topic['MatterIndexName'].strip())

            bill.add_version_link(
                'Board Report',
                'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'
                .format(matter_id),
                media_type="application/pdf")

            for attachment in self.attachments(matter_id):
                if attachment['MatterAttachmentName']:
                    bill.add_document_link(
                        attachment['MatterAttachmentName'],
                        attachment['MatterAttachmentHyperlink'],
                        media_type="application/pdf")

            bill.extras = {'local_classification': matter['MatterTypeName']}

            text = self.text(matter_id)

            if text:
                if text['MatterTextPlain']:
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf']:
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                        u'\u0000', '')

            yield bill
Beispiel #51
0
    def scrape_bill(self, session, bill_url):
        page = self.get(bill_url).text
        page = lxml.html.fromstring(page)
        page.make_links_absolute(bill_url)

        try:
            bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
        except IndexError:
            self.logger.warning("Something is wrong with bill page, skipping.")
            return
        secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

        # checking if there is a matching bill
        if secondary_bill_id:
            secondary_bill_id = secondary_bill_id[0].text
            # swap ids if * is in secondary_bill_id
            if '*' in secondary_bill_id:
                bill_id, secondary_bill_id = secondary_bill_id, bill_id
                secondary_bill_id = secondary_bill_id.strip()
            secondary_bill_id = secondary_bill_id.replace('  ', ' ')

        bill_id = bill_id.replace('*', '').replace('  ', ' ').strip()

        if 'B' in bill_id:
            bill_type = 'bill'
        elif 'JR' in bill_id:
            bill_type = 'joint resolution'
        elif 'R' in bill_id:
            bill_type = 'resolution'

        primary_chamber = 'lower' if 'H' in bill_id else 'upper'
        # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

        title = page.xpath("//span[@id='lblAbstract']")[0].text
        if title is None:
            msg = '%s detail page was missing title info.'
            self.logger.warning(msg % bill_id)
            return

        # bill subject
        subject_pos = title.find('-')
        subjects = [s.strip() for s in title[:subject_pos - 1].split(',')]
        subjects = filter(None, subjects)

        bill = Bill(
            bill_id,
            legislative_session=session,
            chamber=primary_chamber,
            title=title,
            classification=bill_type,
        )
        for subject in subjects:
            bill.add_subject(subject)

        if secondary_bill_id:
            bill.add_identifier(secondary_bill_id)

        bill.add_source(bill_url)

        # Primary Sponsor
        sponsor = page.xpath("//span[@id='lblBillPrimeSponsor']")[0].text_content().split("by")[-1]
        sponsor = sponsor.replace('*', '').strip()
        if sponsor:
            bill.add_sponsorship(
                sponsor,
                classification='primary',
                entity_type='person',
                primary=True,
            )

        # bill text
        btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
        bill.add_version_link('Current Version', btext.get('href'),
                              media_type='application/pdf')

        # documents
        summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
        if summary:
            bill.add_document_link('Summary', summary[0].get('href'))
        fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
        if fiscal:
            bill.add_document_link('Fiscal Note', fiscal[0].get('href'))
        amendments = page.xpath('//a[contains(@href, "/Amend/")]')
        for amendment in amendments:
            bill.add_document_link('Amendment ' + amendment.text, amendment.get('href'))
        # amendment notes in image with alt text describing doc inside <a>
        amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
        for afn in amend_fns:
            bill.add_document_link(
                afn.get('alt'),
                afn.getparent().get('href'),
                on_duplicate='ignore'
            )

        # actions
        atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
        actions_from_table(bill, atable)

        # if there is a matching bill
        if secondary_bill_id:
            # secondary sponsor
            secondary_sponsor = page.xpath(
                "//span[@id='lblCompPrimeSponsor']")[0].text_content().split("by")[-1]
            secondary_sponsor = secondary_sponsor.replace('*', '').replace(')', '').strip()
            # Skip black-name sponsors.
            if secondary_sponsor:
                bill.add_sponsorship(
                    secondary_sponsor,
                    classification='primary',
                    entity_type='person',
                    primary=True,
                )

            # secondary actions
            cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
            actions_from_table(bill, cotable)

        # votes
        yield from self.scrape_vote_events(bill, page, bill_url)

        bill.actions.sort(key=lambda a: a['date'])
        yield bill
Beispiel #52
0
    def scrape(self, session=None, chambers=None):
        # Bills endpoint can sometimes take a very long time to load
        self.timeout = 300

        if not session:
            session = self.latest_session()
            self.info('no session, using %s', session)

        if int(session) < 128:
            raise AssertionError("No data for period {}".format(session))

        elif int(session) < 131:
            # they changed their data format starting in 131st and added
            # an undocumented API
            yield from self.old_scrape(session)

        else:
            chamber_dict = {"Senate": "upper", "House": "lower",
                            "House of Representatives": "lower",
                            "house": "lower", "senate": "upper"}

            # so presumanbly not everything passes, but we haven't
            # seen anything not pass yet, so we'll need to wait
            # till it fails and get the right language in here
            vote_results = {"approved": True,
                            "passed": True,
                            "adopted": True,
                            "true": True,
                            "false": False,
                            "failed": False,
                            True: True,
                            False: False}

            action_dict = {"ref_ctte_100": "referral-committee",
                           "intro_100": "introduction",
                           "pass_300": "passage",
                           "intro_110": "reading-1",
                           "refer_210": "referral-committee",
                           "crpt_301": None,
                           "crpt_317": None,
                           "concur_606": "passage",
                           "pass_301": "passage",
                           "refer_220": "referral-committee",
                           "intro_102": ["introduction", "passage"],
                           "intro_105": ["introduction", "passage"],
                           "intro_ref_ctte_100": "referral-committee",
                           "refer_209": None,
                           "intro_108": ["introduction", "passage"],
                           "intro_103": ["introduction", "passage"],
                           "msg_reso_503": "passage",
                           "intro_107": ["introduction", "passage"],
                           "imm_consid_360": "passage",
                           "refer_213": None,
                           "adopt_reso_100": "passage",
                           "msg_507": "amendment-passage",
                           "confer_713": None,
                           "concur_603": None,
                           "confer_712": None,
                           "msg_506": "amendment-failure",
                           "receive_message_100": "passage",
                           "motion_920": None,
                           "concur_611": None,
                           "confer_735": None
                           }

            base_url = "http://search-prod.lis.state.oh.us"
            first_page = base_url
            first_page += "/solarapi/v1/general_assembly_{session}/".format(session=session)
            legislators = self.get_legislator_ids(first_page)
            all_amendments = self.get_other_data_source(first_page, base_url, "amendments")
            all_fiscals = self.get_other_data_source(first_page, base_url, "fiscals")
            all_synopsis = self.get_other_data_source(first_page, base_url, "synopsiss")
            all_analysis = self.get_other_data_source(first_page, base_url, "analysiss")

            for row in self.get_bill_rows(session):
                number_link, ga, title, primary_sponsor, status = row.xpath('td')

                bill_id = number_link.text_content()
                title = title.text_content().strip()
                chamber = 'lower' if 'H' in bill_id else 'upper'
                classification = 'bill' if 'B' in bill_id else 'resolution'

                bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                            title=title, classification=classification)
                bill.add_source(number_link.xpath('a/@href')[0])

                # get bill from API
                bill_api_url = ('http://search-prod.lis.state.oh.us/solarapi/v1/'
                                'general_assembly_{}/{}/{}/'.format(
                                    session,
                                    'bills' if 'B' in bill_id else 'resolutions',
                                    bill_id.lower().replace(' ', '')
                                ))
                data = self.get(bill_api_url).json()

                # add title if no short title
                if not bill.title:
                    bill.title = data['items'][0]['longtitle']
                bill.add_title(data['items'][0]['longtitle'], 'long title')

                # this stuff is version-specific
                for version in data['items']:
                    version_name = version["version"]
                    version_link = base_url+version["pdfDownloadLink"]
                    bill.add_version_link(version_name, version_link, media_type='application/pdf')

                # we'll use latest bill_version for everything else
                bill_version = data['items'][0]
                bill.add_source(bill_api_url)

                # subjects
                for subj in bill_version["subjectindexes"]:
                    try:
                        bill.add_subject(subj["primary"])
                    except KeyError:
                        pass
                    try:
                        secondary_subj = subj["secondary"]
                    except KeyError:
                        secondary_subj = ""
                    if secondary_subj:
                        bill.add_subject(secondary_subj)

                # sponsors
                sponsors = bill_version["sponsors"]
                for sponsor in sponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                                        sponsor_name,
                                        classification='primary',
                                        entity_type='person',
                                        primary=True
                        )

                cosponsors = bill_version["cosponsors"]
                for sponsor in cosponsors:
                    sponsor_name = self.get_sponsor_name(sponsor)
                    bill.add_sponsorship(
                                         sponsor_name,
                                         classification='cosponsor',
                                         entity_type='person',
                                         primary=False,
                        )

                try:
                    action_doc = self.get(base_url+bill_version["action"][0]["link"])
                except scrapelib.HTTPError:
                    pass
                else:

                    actions = action_doc.json()
                    for action in reversed(actions["items"]):
                        actor = chamber_dict[action["chamber"]]
                        action_desc = action["description"]
                        try:
                            action_type = action_dict[action["actioncode"]]
                        except KeyError:
                            self.warning("Unknown action {desc} with code {code}."
                                         " Add it to the action_dict"
                                         ".".format(desc=action_desc,
                                                    code=action["actioncode"]))
                            action_type = None

                        date = self._tz.localize(datetime.datetime.strptime(
                                                 action["datetime"],
                                                 "%Y-%m-%dT%H:%M:%S"))
                        date = "{:%Y-%m-%d}".format(date)

                        bill.add_action(action_desc,
                                        date, chamber=actor,
                                        classification=action_type)

                # attach documents gathered earlier
                self.add_document(all_amendments, bill_id, "amendment", bill, base_url)
                self.add_document(all_fiscals, bill_id, "fiscal", bill, base_url)
                self.add_document(all_synopsis, bill_id, "synopsis", bill, base_url)
                self.add_document(all_analysis, bill_id, "analysis", bill, base_url)

                # votes
                vote_url = base_url+bill_version["votes"][0]["link"]
                vote_doc = self.get(vote_url)
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url,
                                             base_url, bill, legislators,
                                             chamber_dict, vote_results)

                vote_url = base_url
                vote_url += bill_version["cmtevotes"][0]["link"]
                try:
                    vote_doc = self.get(vote_url)
                except scrapelib.HTTPError:
                    self.warning("Vote page not "
                                 "loading; skipping: {}".format(vote_url))
                    continue
                votes = vote_doc.json()
                yield from self.process_vote(votes, vote_url,
                                             base_url, bill, legislators,
                                             chamber_dict, vote_results)

                # we have never seen a veto or a disapprove, but they seem important.
                # so we'll check and throw an error if we find one
                # life is fragile. so are our scrapers.
                if "veto" in bill_version:
                    veto_url = base_url+bill_version["veto"][0]["link"]
                    veto_json = self.get(veto_url).json()
                    if len(veto_json["items"]) > 0:
                        raise AssertionError("Whoa, a veto! We've never"
                                             " gotten one before."
                                             " Go write some code to deal"
                                             " with it: {}".format(veto_url))

                if "disapprove" in bill_version:
                    disapprove_url = base_url+bill_version["disapprove"][0]["link"]
                    disapprove_json = self.get(disapprove_url).json()
                    if len(disapprove_json["items"]) > 0:
                        raise AssertionError("Whoa, a disapprove! We've never"
                                             " gotten one before."
                                             " Go write some code to deal "
                                             "with it: {}".format(disapprove_url))

                yield bill
Beispiel #53
0
    def scrape_bill(self, chamber, session, bill_id, url):
        try:
            page = lxml.html.fromstring(self.get(url).text)
        except scrapelib.HTTPError as e:
            self.warning('error (%s) fetching %s, skipping' % (e, url))
            return

        title = page.xpath(
            "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()
        if not title:
            self.warning('blank bill on %s - skipping', url)
            return

        if 'JR' in bill_id:
            bill_type = ['joint resolution']
        elif 'CR' in bill_id:
            bill_type = ['concurrent resolution']
        elif 'R' in bill_id:
            bill_type = ['resolution']
        else:
            bill_type = ['bill']

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.add_source(url)
        bill.subject = self.subject_map[bill_id]

        for link in page.xpath("//a[contains(@id, 'Auth')]"):
            name = link.xpath("string()").strip()

            if ':' in name:
                raise Exception(name)
            if 'otherAuth' in link.attrib['id']:
                bill.add_sponsorship(name, classification='cosponsor',
                                     entity_type='person', primary=False)
            else:
                bill.add_sponsorship(name, classification='primary',
                                     entity_type='person', primary=True)

        act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
        for tr in act_table.xpath("tr")[2:]:
            action = tr.xpath("string(td[1])").strip()
            if not action or action == 'None':
                continue

            date = tr.xpath("string(td[3])").strip()
            date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

            actor = tr.xpath("string(td[4])").strip()
            if actor == 'H':
                actor = 'lower'
            elif actor == 'S':
                actor = 'upper'

            attrs = self.categorizer.categorize(action)
            related_entities = []
            for item in attrs['committees']:
                related_entities.append({
                    'type': 'committee',
                    'name': item
                })
            for item in attrs['legislators']:
                related_entities.append({
                    'type': 'legislator',
                    'name': item
                })
            bill.add_action(description=action,
                            date=date.strftime('%Y-%m-%d'),
                            chamber=actor,
                            classification=attrs['classification'],
                            related_entities=related_entities)

        version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
        # Keep track of already seen versions to prevent processing duplicates.
        version_urls = []
        for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
            version_url = link.attrib['href']
            if version_url in version_urls:
                self.warning('Skipping duplicate version URL.')
                continue
            else:
                version_urls.append(version_url)
            name = link.text.strip()

            if re.search('COMMITTEE REPORTS|SCHEDULED CCR', version_url, re.IGNORECASE):
                bill.add_document_link(note=name, url=version_url,
                                       media_type='application/pdf')
                continue

            bill.add_version_link(note=name, url=version_url,
                                  media_type='application/pdf')

        for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
            if 'HT_' not in link.attrib['href']:
                yield from self.scrape_votes(bill, self.urlescape(link.attrib['href']))

        # # If the bill has no actions and no versions, it's a bogus bill on
        # # their website, which appears to happen occasionally. Skip.
        has_no_title = (bill.title == "Short Title Not Found.")
        if has_no_title:
            # If there's no title, this is an empty page. Skip!
            return

        else:
            # Otherwise, save the bills.
            yield bill
def test_bill_update_subsubitem():
    create_jurisdiction()
    create_org()
    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    # initial sub-subitem
    bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')
    bill.add_version_link('printing',
                          'http://example.com/test.pdf',
                          media_type='application/pdf')
    result = BillImporter('jid', oi, pi).import_data([bill.as_dict()])
    assert result['bill']['insert'] == 1
    obj = Bill.objects.get()
    assert obj.versions.count() == 1
    assert obj.versions.get().links.count() == 1

    # a second subsubitem, update
    bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')
    bill.add_version_link('printing',
                          'http://example.com/test.pdf',
                          media_type='application/pdf')
    bill.add_version_link('printing',
                          'http://example.com/test.text',
                          media_type='text/plain')
    result = BillImporter('jid', oi, pi).import_data([bill.as_dict()])
    assert result['bill']['update'] == 1
    obj = Bill.objects.get()
    assert obj.versions.count() == 1
    assert obj.versions.get().links.count() == 2

    # same thing, noop
    bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')
    bill.add_version_link('printing',
                          'http://example.com/test.pdf',
                          media_type='application/pdf')
    bill.add_version_link('printing',
                          'http://example.com/test.text',
                          media_type='text/plain')
    result = BillImporter('jid', oi, pi).import_data([bill.as_dict()])
    assert result['bill']['noop'] == 1
    obj = Bill.objects.get()
    assert obj.versions.count() == 1
    assert obj.versions.get().links.count() == 2

    # different link for second one, update
    bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')
    bill.add_version_link('printing',
                          'http://example.com/test.pdf',
                          media_type='application/pdf')
    bill.add_version_link('printing',
                          'http://example.com/diff-link.txt',
                          media_type='text/plain')
    result = BillImporter('jid', oi, pi).import_data([bill.as_dict()])
    assert result['bill']['update'] == 1
    obj = Bill.objects.get()
    assert obj.versions.count() == 1
    assert obj.versions.get().links.count() == 2

    # delete one, update
    bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')
    bill.add_version_link('printing',
                          'http://example.com/test.pdf',
                          media_type='application/pdf')
    result = BillImporter('jid', oi, pi).import_data([bill.as_dict()])
    assert result['bill']['update'] == 1
    obj = Bill.objects.get()
    assert obj.versions.count() == 1
    assert obj.versions.get().links.count() == 1
Beispiel #55
0
    def scrape_bill(self, row, session):
        bill_id = row["LegislationDisplayCode"]

        amendment = None
        substitute = None

        if bill_id.count(" ") > 1:
            if " w/ " in bill_id:
                self.info("Found amended bill `{}`".format(bill_id))
                bill_id, amendment = bill_id.split(" w/ ")
            # A bill can _both_ be amended and be substituted
            if " for " in bill_id:
                self.info("Found substitute to use instead: `{}`".format(bill_id))
                substitute, bill_id = bill_id.split(" for ")
            if amendment is None and substitute is None:
                raise ValueError("unknown bill_id format: " + bill_id)

        bill_type = self.classify_bill(bill_id)
        chamber = "upper" if bill_id.startswith("S") else "lower"

        bill = Bill(
            identifier=bill_id,
            legislative_session=session,
            chamber=chamber,
            title=row["LongTitle"],
            classification=bill_type,
        )
        if row["Synopsis"]:
            bill.add_abstract(row["Synopsis"], "synopsis")
        if row["ShortTitle"]:
            bill.add_title(row["ShortTitle"], "short title")
        if row["SponsorPersonId"]:
            self.add_sponsor_by_legislator_id(bill, row["SponsorPersonId"], "primary")
        if substitute:
            bill.extras["substitute"] = substitute
        if amendment:
            bill.extras["amendment"] = amendment

        # TODO: Is there a way get additional sponsors and cosponsors, and versions/fns via API?
        html_url = "https://legis.delaware.gov/BillDetail?LegislationId={}".format(
            row["LegislationId"]
        )
        bill.add_source(html_url, note="text/html")

        html = self.lxmlize(html_url)

        additional_sponsors = html.xpath(
            '//label[text()="Additional Sponsor(s):"]' "/following-sibling::div/a/@href"
        )
        for sponsor_url in additional_sponsors:
            sponsor_id = sponsor_url.replace(
                "https://legis.delaware.gov/LegislatorDetail?" "personId=", ""
            )
            self.add_sponsor_by_legislator_id(bill, sponsor_id, "primary")

        cosponsors = html.xpath(
            '//label[text()="Co-Sponsor(s):"]/' "following-sibling::div/a/@href"
        )
        for sponsor_url in cosponsors:
            sponsor_id = sponsor_url.replace(
                "https://legis.delaware.gov/LegislatorDetail?" "personId=", ""
            )
            self.add_sponsor_by_legislator_id(bill, sponsor_id, "cosponsor")

        versions = html.xpath(
            '//label[text()="Original Text:"]/following-sibling::div/a/@href'
        )
        for version_url in versions:
            media_type = self.mime_from_link(version_url)
            version_name = "Bill Text"
            bill.add_version_link(version_name, version_url, media_type=media_type)

        fiscals = html.xpath('//div[contains(@class,"fiscalNote")]/a/@href')
        for fiscal in fiscals:
            self.scrape_fiscal_note(bill, fiscal)

        self.scrape_actions(bill, row["LegislationId"])

        if row["HasAmendments"] is True:
            self.scrape_amendments(bill, row["LegislationId"])

        yield from self.scrape_votes(bill, row["LegislationId"], session)

        yield bill
Beispiel #56
0
    def scrape_bill_list(self, url):
        bill_list = self._get_bill_list(url)

        for bill_info in bill_list:

            (bill_id, ) = bill_info.xpath('td[1]/font/input/@value')
            (sponsor, ) = bill_info.xpath('td[2]/font/input/@value')
            (subject, ) = bill_info.xpath('td[3]//text()')
            subject = subject.strip()
            chamber = self.CHAMBERS[bill_id[0]]

            if 'B' in bill_id:
                bill_type = 'bill'
            elif 'JR' in bill_id:
                bill_type = 'joint resolution'
            elif 'R' in bill_id:
                bill_type = 'resolution'
            else:
                raise AssertionError(
                    "Unknown bill type for bill '{}'".format(bill_id))

            bill = Bill(
                bill_id,
                legislative_session=self.session,
                chamber=chamber,
                title='',
                classification=bill_type,
            )
            if subject:
                bill.subject = [subject]
            if sponsor:
                bill.add_sponsorship(
                    name=sponsor,
                    entity_type='person',
                    classification='primary',
                    primary=True,
                )
            bill.add_source(url)

            bill_url = ('http://alisondb.legislature.state.al.us/Alison/'
                        'SESSBillStatusResult.aspx?BILL={}'.format(bill_id))
            bill.add_source(bill_url)

            bill_html = self._get_bill_response(bill_url)
            if bill_html is None:
                self.warning(
                    "Bill {} has no webpage, and will be skipped".format(
                        bill_id))
                continue
            bill_doc = lxml.html.fromstring(bill_html)

            if (bill_doc.xpath(
                    '//span[@id="ContentPlaceHolder1_lblShotTitle"]')):
                title = bill_doc.xpath(
                    '//span[@id="ContentPlaceHolder1_lblShotTitle"]'
                )[0].text_content().strip()
            if not title:
                title = "[No title given by state]"
            bill.title = title

            version_url_base = (
                'http://alisondb.legislature.state.al.us/ALISON/'
                'SearchableInstruments/{0}/PrintFiles/{1}-'.format(
                    self.session, bill_id))
            versions = bill_doc.xpath(
                '//table[@class="box_versions"]/tr/td[2]/font/text()')
            for version in versions:
                name = version
                if version == "Introduced":
                    version_url = version_url_base + 'int.pdf'
                elif version == "Engrossed":
                    version_url = version_url_base + 'eng.pdf'
                elif version == "Enrolled":
                    version_url = version_url_base + 'enr.pdf'
                else:
                    raise NotImplementedError(
                        "Unknown version type found: '{}'".format(name))

                bill.add_version_link(
                    name,
                    version_url,
                    media_type='application/pdf',
                    on_duplicate='ignore',
                )

            # Fiscal notes exist, but I can't figure out how to build their URL
            fiscal_notes = bill_doc.xpath(
                '//table[@class="box_fiscalnote"]')[1:]
            for fiscal_note in fiscal_notes:
                pass

            # Budget Isolation Resolutions are handled as extra actions/votes
            birs = bill_doc.xpath(
                '//div[@class="box_bir"]//table//table/tr')[1:]
            for bir in birs:
                bir_action = bir.xpath('td[1]')[0].text_content().strip()
                # Sometimes ALISON's database puts another bill's
                # actions into the BIR action list; ignore these
                if bill_id not in bir_action:
                    self.warning(
                        "BIR action found ({}) ".format(bir_action) +
                        "that doesn't match the bill ID ({})".format(bill_id))
                    continue

                bir_date = datetime.datetime.strptime(
                    bir.xpath('td[2]/font/text()')[0], self.DATE_FORMAT)
                bir_type = bir.xpath('td[1]/font/text()')[0].split(" ")[0]
                bir_chamber = self.CHAMBERS[bir_type[0]]
                bir_text = "{0}: {1}".format(
                    bir_type,
                    bir.xpath('td[3]/font/text()')[0].strip())

                bill.add_action(
                    bir_text,
                    TIMEZONE.localize(bir_date),
                    chamber=bir_chamber,
                    classification='other',
                )

                try:
                    (bir_vote_id, ) = bir.xpath('td[4]/font/input/@value')
                except ValueError:
                    bir_vote_id = ''

                bir_vote_id = bir_vote_id.strip()
                if bir_vote_id.startswith("Roll "):
                    bir_vote_id = bir_vote_id.split(" ")[-1]

                    yield from self.scrape_vote(
                        bill=bill,
                        vote_chamber=bir_type[0],
                        bill_id="{0}%20for%20{1}".format(bir_type, bill_id),
                        vote_id=bir_vote_id,
                        vote_date=TIMEZONE.localize(bir_date),
                        action_text=bir_text)

            actions = bill_doc.xpath(
                '//table[@id="ContentPlaceHolder1_gvHistory"]/tr')[1:]
            action_date = None
            for action in actions:
                # If actions occur on the same day, only one date will exist
                if (action.xpath('td[1]/font/text()')[0].encode(
                        'ascii', 'ignore').strip()):
                    action_date = datetime.datetime.strptime(
                        action.xpath('td[1]/font/text()')[0], self.DATE_FORMAT)

                (action_chamber, ) = action.xpath('td[2]/font/text()')

                if action.xpath('td[3]/font/u/text()'):
                    (amendment, ) = action.xpath('td[3]/font/u/text()')
                else:
                    amendment = None

                (action_text, ) = action.xpath('td[4]/font/text()')

                action_type = _categorize_action(action_text)

                # check for occasional extra last row
                if not action_chamber.strip():
                    continue

                # The committee cell is just an abbreviation, so get its name
                actor = self.CHAMBERS[action_chamber]
                try:
                    action_committee = re.search(
                        r'.*? referred to the .*? committee on (.*?)$',
                        action_text).group(1).strip()
                except AttributeError:
                    action_committee = ''

                act = bill.add_action(
                    action_text,
                    TIMEZONE.localize(action_date),
                    chamber=actor,
                    classification=action_type,
                )
                if action_committee:
                    act.add_related_entity(action_committee,
                                           entity_type='organization')

                try:
                    vote_button = action.xpath('td[9]//text()')[0].strip()
                except IndexError:
                    vote_button = ''

                if vote_button.startswith("Roll "):
                    vote_id = vote_button.split(" ")[-1]

                    yield from self.scrape_vote(
                        bill=bill,
                        vote_chamber=action_chamber,
                        bill_id=bill_id,
                        vote_id=vote_id,
                        vote_date=TIMEZONE.localize(action_date),
                        action_text=action_text)

                if amendment:
                    amend_url = (
                        'http://alisondb.legislature.state.al.us/ALISON/'
                        'SearchableInstruments/{0}/PrintFiles/{1}.pdf'.format(
                            self.session, amendment))

                    amend_name = 'Amd/Sub {}'.format(amendment)

                    bill.add_version_link(
                        amend_name,
                        amend_url,
                        media_type='application/pdf',
                        on_duplicate='ignore',
                    )

            yield bill
    def scrape(self) :
        three_days_ago = datetime.datetime.now() - datetime.timedelta(3)
        for matter in self.matters(three_days_ago) :
            matter_id = matter['MatterId']

            date = matter['MatterIntroDate']
            title = matter['MatterTitle']
            identifier = matter['MatterFile']

            if not all((date, title, identifier)) :
                continue

            bill_session = self.session(self.toTime(date))
            bill_type = BILL_TYPES[matter['MatterTypeName']]

            if identifier.startswith('S'):
                alternate_identifiers = [identifier]
                identifier = identifier[1:]
            else:
                alternate_identifiers = []

            bill = Bill(identifier=identifier,
                        legislative_session=bill_session,
                        title=title,
                        classification=bill_type,
                        from_organization={"name":"Chicago City Council"})

            legistar_web = self.legislation_detail_url(matter_id)
            legistar_api = 'http://webapi.legistar.com/v1/chicago/matters/{0}'.format(matter_id)

            bill.add_source(legistar_web, note='web')
            bill.add_source(legistar_api, note='api')

            for identifier in alternate_identifiers:
                bill.add_identifier(identifier)

            for action, vote in self.actions(matter_id) :
                act = bill.add_action(**action)

                if action['description'] == 'Referred' :
                    body_name = matter['MatterBodyName']
                    if body_name != 'City Council' :
                        act.add_related_entity(body_name,
                                               'organization',
                                               entity_id = _make_pseudo_id(name=body_name))

                result, votes = vote
                if result :
                    vote_event = VoteEvent(legislative_session=bill.legislative_session, 
                                           motion_text=action['description'],
                                           organization=action['organization'],
                                           classification=None,
                                           start_date=action['date'],
                                           result=result,
                                           bill=bill)

                    vote_event.add_source(legistar_web)
                    vote_event.add_source(legistar_api + '/histories')

                    for vote in votes :
                        raw_option = vote['VoteValueName'].lower()
                        clean_option = self.VOTE_OPTIONS.get(raw_option,
                                                             raw_option)
                        vote_event.vote(clean_option, 
                                        vote['VotePersonName'].strip())

                    yield vote_event


            for sponsorship in self.sponsorships(matter_id) :
                bill.add_sponsorship(**sponsorship)

            for topic in self.topics(matter_id) :
                bill.add_subject(topic['MatterIndexName'].strip())

            for attachment in self.attachments(matter_id) :
                if attachment['MatterAttachmentName'] :
                    bill.add_version_link(attachment['MatterAttachmentName'],
                                          attachment['MatterAttachmentHyperlink'],
                                          media_type="application/pdf")

            bill.extras = {'local_classification' : matter['MatterTypeName']}

            text = self.text(matter_id)

            if text :
                if text['MatterTextPlain'] :
                    bill.extras['plain_text'] = text['MatterTextPlain']

                if text['MatterTextRtf'] :
                    bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

            yield bill