Exemple #1
0
    def scrape_bill(self, chamber, session):
        """Read the pipe-delimited measures feed and register matching bills.

        A row is kept when its first column maps to ``chamber`` and its
        last column equals ``self.slug``.  Each kept row becomes a ``Bill``
        with a source, a primary sponsor and one PDF version; the bill is
        then passed to ``self.scrape_bill_page`` and stored in
        ``self.bills`` keyed by its bill id.
        """
        url = "ftp://www.arkleg.state.ar.us/dfadooas/LegislativeMeasures.txt"
        raw = self.urlopen(url)
        rows = unicode_csv_reader(StringIO.StringIO(raw), delimiter='|')

        chamber_by_letter = {'H': 'lower', 'S': 'upper'}
        type_by_abbr = {
            'B': 'bill',
            'R': 'resolution',
            'JR': 'joint resolution',
            'CR': 'concurrent resolution',
            'MR': 'memorial resolution',
            'CMR': 'concurrent memorial resolution',
        }

        for row in rows:
            # An unexpected chamber letter raises KeyError here.
            if chamber_by_letter[row[0]] != chamber:
                continue

            bill_id = "%s%s %s" % (row[0], row[1], row[2])

            # Letters after the chamber prefix and before the space select
            # the measure type; an unknown abbreviation raises KeyError.
            abbr = re.match(r'(H|S)([A-Z]+)\s', bill_id).group(2)
            bill_type = type_by_abbr[abbr]

            # The session check happens only after the type lookup.
            if row[-1] != self.slug:
                continue

            bill = Bill(session, chamber, bill_id, row[3], type=bill_type)
            bill.add_source(url)

            # Column 12 is the fallback when column 11 is empty.
            bill.add_sponsor('primary', row[11] or row[12])

            compact_id = bill_id.replace(' ', '')
            version_url = ("ftp://www.arkleg.state.ar.us/Bills/"
                           "%s/Public/%s.pdf" % (session, compact_id))
            bill.add_version(bill_id, version_url, mimetype='application/pdf')

            self.scrape_bill_page(bill)

            self.bills[bill_id] = bill
Exemple #2
0
    def scrape_bill(self, chamber, session, bill_id, url):
        """Scrape a single bill page and save the resulting ``Bill``.

        The type comes from ``bill_id``: an id containing 'B' is a bill,
        otherwise one containing 'J' is a joint resolution, and anything
        else raises ``ValueError``.  Documents and actions are read from
        ``url`` with ``stab=01`` swapped for ``stab=02`` / ``stab=03``.
        """
        doc = lxml.html.fromstring(self.urlopen(url))
        doc.make_links_absolute(url)

        title = doc.xpath('//h3[@class="h3billright"]')[0].text
        # TODO: grab summary (none present at time of writing)

        if 'B' not in bill_id and 'J' not in bill_id:
            raise ValueError('unknown bill type ' + bill_id)
        bill_kind = ['bill'] if 'B' in bill_id else ['joint resolution']

        bill = Bill(session, chamber, bill_id, title, type=bill_kind)
        bill.add_source(url)

        # Sponsors: strip the honorifics, then split the remaining name
        # list.  The first name is the primary sponsor, the rest cosponsors.
        sponsor_text = _get_td(doc, 'All Sponsors:').text_content()
        for honorific in ('Delegates ', 'Delegate ', 'Senator ', 'Senators '):
            sponsor_text = sponsor_text.replace(honorific, '')
        for position, name in enumerate(re.split(', (?:and )?', sponsor_text)):
            role = 'primary' if position == 0 else 'cosponsor'
            bill.add_sponsor(role, name)

        # Subjects: keep only the part before any " -see also-" suffix.
        subject_texts = _get_td(doc, 'Narrow Subject(s):').xpath('a/text()')
        bill['subjects'] = [text.split(' -see also-')[0]
                            for text in subject_texts if text]

        # Documents and actions are on separate tabs of the same page.
        documents_url = url.replace('stab=01', 'stab=02')
        actions_url = url.replace('stab=01', 'stab=03')
        self.scrape_documents(bill, documents_url)
        self.scrape_actions(bill, actions_url)

        self.save_bill(bill)
Exemple #3
0
    def parse_bill(self, chamber, session, special, link):
        """Scrape the bill referenced by ``link`` and save it.

        ``link`` is an anchor element: its text is the bill number and its
        ``href`` carries a ``type=`` query value ('B' for bill, 'R' for
        resolution).  Bills whose info page has an empty short title are
        skipped silently.

        Raises:
            ValueError: if the link's type is neither 'B' nor 'R' and the
                bill would otherwise be created.
        """
        bill_num = link.text.strip()
        type_abbr = re.search('type=(B|R|)', link.attrib['href']).group(1)

        # The pattern also matches an empty type; leave ``btype`` as None
        # in that case and reject it explicitly before building the Bill,
        # instead of failing with an UnboundLocalError.
        btype = {'B': ['bill'], 'R': ['resolution']}.get(type_abbr)

        bill_id = "%s%s %s" % (bill_abbr(chamber), type_abbr, bill_num)

        url = info_url(chamber, session, special, type_abbr, bill_num)
        page = self.urlopen(url)
        page = lxml.html.fromstring(page)
        page.make_links_absolute(url)

        xpath = '//div[contains(@class, "BillInfo-ShortTitle")]/div[@class="BillInfo-Section-Data"]'
        title = page.xpath(xpath).pop().text_content().strip()
        if not title:
            return
        if btype is None:
            raise ValueError('unknown bill type %r for %s' % (type_abbr, bill_id))
        bill = Bill(session, chamber, bill_id, title, type=btype)
        bill.add_source(url)

        self.parse_bill_versions(bill, page)

        self.parse_history(
            bill, history_url(chamber, session, special, type_abbr, bill_num))

        # Votes are fetched unconditionally, regardless of what the history
        # page showed.
        self.parse_votes(
            bill, vote_url(chamber, session, special, type_abbr, bill_num))

        # Dedupe sources, keeping the first occurrence of each.  Build a
        # fresh list instead of removing from ``sources`` while iterating
        # over it (which skips elements), then update the list in place so
        # the bill keeps the same list object.
        sources = bill['sources']
        unique_sources = []
        for source in sources:
            if source not in unique_sources:
                unique_sources.append(source)
        sources[:] = unique_sources

        self.save_bill(bill)
Exemple #4
0
    def parse_senate_billpage(self, bill_url, year):
        """Scrape an upper-chamber bill page (BeautifulSoup) and save it.

        Reads the bill number, titles and LR number from the labelled
        elements on the page, then the primary sponsor, and follows the
        cosponsor, action and full-text links when they are present.
        """
        with self.urlopen(bill_url) as bill_page:
            bill_page = BeautifulSoup(bill_page)
            # get all the info needed to record the bill
            bill_id = bill_page.find(id="lblBillNum").b.font.contents[0]
            bill_title = bill_page.find(id="lblBillTitle").font.string
            bill_desc = bill_page.find(id="lblBriefDesc").font.contents[0]
            bill_lr = bill_page.find(id="lblLRNum").font.string

            bill = Bill(year,
                        'upper',
                        bill_id,
                        bill_desc,
                        bill_url=bill_url,
                        bill_lr=bill_lr,
                        official_title=bill_title)
            bill.add_source(bill_url)

            # Get the primary sponsor.  The link target is an *attribute*
            # of the anchor: dotted access (``tag.href``) would look for a
            # child tag named "href" and always give None.
            sponsor_tag = bill_page.find(id="hlSponsor")
            bill_sponsor = sponsor_tag.i.font.contents[0]
            bill_sponsor_link = sponsor_tag.get('href')
            bill.add_sponsor('primary',
                             bill_sponsor,
                             sponsor_link=bill_sponsor_link)

            # cosponsors show up on their own page, if they exist.
            # ``'href' in tag`` tests the tag's children, not its
            # attributes, so look the attribute up explicitly.
            cosponsor_tag = bill_page.find(id="hlCoSponsors")
            if cosponsor_tag is not None and cosponsor_tag.get('href'):
                self.parse_senate_cosponsors(bill, cosponsor_tag['href'])

            # get the actions
            action_url = bill_page.find(id="hlAllActions")['href']
            self.parse_senate_actions(bill, action_url)

            # bill versions are stored on a separate page
            versions_url = bill_page.find(id="hlFullBillText")
            if versions_url:
                self.parse_senate_bill_versions(bill, versions_url['href'])

        self.save_bill(bill)
Exemple #5
0
    def parse_bill_status_page(self, status_url, bill_url, session, chamber):
        """Build a ``Bill`` from a status page and return it.

        The bill id and title are each read from a preferred table cell;
        when that cell is missing, the second cell of the first row is used
        instead.  ``bill_url`` is recorded as the bill's source.
        """
        root = lxml.html.fromstring(self.urlopen(status_url))
        status_page = ElementTree(root)

        # see 2007 HB 2... weird.
        id_cells = status_page.xpath("//tr[2]/td[2]")
        if not id_cells:
            id_cells = status_page.xpath('//tr[1]/td[2]')
        bill_id = id_cells[0].text_content()

        title_cells = status_page.xpath("//form[1]/table[2]/tr[3]/td[2]")
        if not title_cells:
            title_cells = status_page.xpath('//tr[1]/td[2]')
        title = title_cells[0].text_content()

        bill = Bill(session, chamber, bill_id, title)
        bill.add_source(bill_url)

        self.add_sponsors(bill, status_page)
        self.add_actions(bill, status_page)

        return bill
Exemple #6
0
    def scrape(self, chamber, session):
        """Page through the bill search results and save matching bills.

        Result pages 1..999 are requested in order.  An HTTP 404 ends the
        scrape quietly; any other ``scrapelib.HTTPError`` propagates.
        Results titled '(no title)' or belonging to the other chamber are
        skipped.
        """
        search_url = ("http://open.nysenate.gov/legislation/search/"
                      "?search=otype:bill&searchType=&format=xml"
                      "&pageIdx=%d")
        try:
            for index in xrange(1, 1000):
                url = search_url % index
                with self.urlopen(url) as page:
                    tree = lxml.etree.fromstring(page)

                    for result in tree.xpath("//result[@type = 'bill']"):
                        attrs = result.attrib
                        # The bill id is the part before the first dash.
                        bill_id = attrs['id'].split('-')[0]

                        title = attrs['title'].strip()
                        if title == '(no title)':
                            continue

                        primary_sponsor = attrs['sponsor']

                        # Ids starting with 'S' are upper-chamber bills.
                        if bill_id.startswith('S'):
                            bill_chamber = 'upper'
                        else:
                            bill_chamber = 'lower'
                        if bill_chamber != chamber:
                            continue

                        bill = Bill(session, chamber, bill_id, title)
                        bill.add_source(url)
                        bill.add_sponsor('primary', primary_sponsor)

                        bill_url = ("http://open.nysenate.gov/legislation/"
                                    "bill/%s" % attrs['id'])
                        self.scrape_bill(bill, bill_url)
                        bill.add_source(bill_url)

                        self.save_bill(bill)
        except scrapelib.HTTPError as e:
            if e.response.code != 404:
                raise
Exemple #7
0
    def parse_senate_billpage(self, bill_url, year):
        """Scrape an upper-chamber bill page (lxml) and save the bill.

        Labelled elements supply the bill number, titles and LR number.
        The cosponsor, action and full-text pages are followed only when
        the corresponding link is present on the page.
        """
        with self.urlopen(bill_url) as bill_page:
            bill_page = lxml.html.fromstring(bill_page)
            # get all the info needed to record the bill
            # TODO probably still needs to be fixed
            id_el = bill_page.xpath('//*[@id="lblBillNum"]')[0]
            bill_id = id_el.text_content()
            title_el = bill_page.xpath('//*[@id="lblBillTitle"]')[0]
            bill_title = title_el.text_content()
            desc_el = bill_page.xpath('//*[@id="lblBriefDesc"]')[0]
            bill_desc = desc_el.text_content()
            lr_el = bill_page.xpath('//*[@id="lblLRNum"]')[0]
            bill_lr = lr_el.text_content()

            bill = Bill(year, 'upper', bill_id, bill_desc, bill_url=bill_url,
                        bill_lr=bill_lr, official_title=bill_title)
            bill.add_source(bill_url)

            # Primary sponsor: link text is the name, href the profile link.
            sponsor = bill_page.xpath('//*[@id="hlSponsor"]')[0]
            bill.add_sponsor('primary', sponsor.text_content(),
                             sponsor_link=sponsor.attrib.get('href'))

            # cosponsors show up on their own page, if they exist
            cosponsor_links = bill_page.xpath('//*[@id="hlCoSponsors"]')
            if cosponsor_links and 'href' in cosponsor_links[0].attrib:
                self.parse_senate_cosponsors(
                    bill, cosponsor_links[0].attrib['href'])

            # get the actions
            action_links = bill_page.xpath('//*[@id="hlAllActions"]')
            if action_links:
                self.parse_senate_actions(bill, action_links[0].attrib['href'])

            # bill versions are stored on a separate page
            version_links = bill_page.xpath('//*[@id="hlFullBillText"]')
            if version_links and 'href' in version_links[0].attrib:
                self.parse_senate_bill_versions(
                    bill, version_links[0].attrib['href'])

        self.save_bill(bill)
Exemple #8
0
    def scrape(self, chamber, session):
        """Scrape the bill reference table for ``chamber`` and save each bill.

        Every table row after the first yields one bill.  Ids starting
        with 'SJ' or 'HJ' are joint resolutions; everything else is a bill.
        Version links come from columns 8, 11 and 12, the fiscal note from
        column 9 and the summary from column 14.
        """
        abbrevs = {'upper': 'SF', 'lower': 'HB'}
        chamber_abbrev = abbrevs[chamber]

        url = ("http://legisweb.state.wy.us/%s/billreference/"
               "BillReference.aspx?type=%s" % (session, chamber_abbrev))
        page = self.lxmlize(url)

        rows = page.xpath("//table[contains(@id,'cphContent_gvBills')]//tr")
        for tr in rows[1:]:
            bill_id = tr.xpath("string(td[1])").strip()
            title = tr.xpath("string(td[2])").strip()

            is_joint = bill_id[0:2] in ('SJ', 'HJ')
            bill_type = 'joint resolution' if is_joint else 'bill'

            bill = Bill(session, chamber, bill_id, title, type=bill_type)

            self.scrape_digest(bill)

            # versions
            version_links = tr.xpath('td[8]//a')
            version_links += tr.xpath('td[11]//a')
            version_links += tr.xpath('td[12]//a')
            for link in version_links:
                # skip references to other bills
                if not link.text.startswith('See'):
                    bill.add_version(link.text, link.get('href'),
                                     mimetype='application/pdf')

            # documents: only the first link in each column is used
            fiscal_links = tr.xpath('td[9]//a')
            if fiscal_links:
                bill.add_document('Fiscal Note', fiscal_links[0].get('href'))
            summary_links = tr.xpath('td[14]//a')
            if summary_links:
                bill.add_document('Summary', summary_links[0].get('href'))

            bill.add_source(url)
            self.save_bill(bill)
Exemple #9
0
    def scrape_bill(self, chamber, session, bill_type, number):
        """Create a ``Bill`` from its page, fill it in and save it.

        ``bill_type`` and ``number`` are combined into the bill id
        (``"%s %d"``).  A four-character ``session`` gets an ``rs`` suffix
        when building the page URL.

        Raises:
            ValueError: if ``bill_type`` contains neither 'B' (bill) nor
                'J' (joint resolution).
        """
        if len(session) == 4:
            session_url = session+'rs'
        else:
            session_url = session
        url = BILL_URL % (session_url, bill_type, number)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)
            # find <a name="Title">, get parent dt, get parent dl, then dd n dl
            title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()

            synopsis = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

            if 'B' in bill_type:
                _type = ['bill']
            elif 'J' in bill_type:
                _type = ['joint resolution']
            else:
                # Fail with a clear message rather than an UnboundLocalError
                # on the Bill(...) call below.
                raise ValueError('unknown bill type ' + bill_type)

            bill = Bill(session, chamber, "%s %d" % (bill_type, number), title,
                        type=_type, synopsis=synopsis)
            bill.add_source(url)

            self.parse_bill_sponsors(doc, bill)     # sponsors
            self.parse_bill_actions(doc, bill)      # actions
            self.parse_bill_documents(doc, bill)    # documents and versions
            self.parse_bill_votes(doc, bill)        # votes

            # subjects: keep only the text before any "-see also-" suffix
            bill['subjects'] = [
                subj.text.split('-see also-')[0]
                for subj in doc.xpath('//a[contains(@href, "/subjects/")]')]

            # add bill to collection
            self.save_bill(bill)
Exemple #10
0
    def scrape_bill(self, session, chamber, bill_url):
        """Scrape the bill page at ``CO_URL_BASE + bill_url`` and save it.

        A 503 response is logged and the bill is skipped; any other
        ``scrapelib.HTTPError`` is re-raised.
        """
        full_url = '{}{}'.format(CO_URL_BASE, bill_url)

        try:
            page = self.lxmlize(full_url)
        except scrapelib.HTTPError as e:
            if e.response.status_code != 503:
                raise
            self.error('Skipping %s w/ 503', bill_url)
            return

        number_nodes = page.xpath(
            '//div[contains(@class,"field-name-field-bill-number")]'
            '//div[contains(@class,"field-item even")][1]/text()')
        bill_number = number_nodes[0].strip()

        bill_title = page.xpath('//span[@property="dc:title"]/@content')[0]

        bill_summary = page.xpath(
            'string(//div[contains(@class,"field-name-field-bill-summary")])'
        ).strip()

        bill = Bill(session, chamber, bill_number, bill_title,
                    summary=bill_summary)
        bill.add_source(full_url)

        # Each helper reads its own section of the same page.
        self.scrape_sponsors(bill, page)
        self.scrape_actions(bill, page)
        self.scrape_versions(bill, page)
        self.scrape_research_notes(bill, page)
        self.scrape_fiscal_notes(bill, page)
        self.scrape_committee_report(bill, page)
        self.scrape_votes(bill, page)
        self.scrape_amendments(bill, page)

        self.save_bill(bill)
Exemple #11
0
    def scrape_bill(self, chamber, session, bill_id):
        """Fetch one piece of legislation from the web service.

        The biennium string is built from characters 0-3 and 7-8 of
        ``session``; the bill number is the second whitespace-separated
        token of ``bill_id``.

        Returns:
            The populated ``Bill``, or ``None`` when the legislation is a
            gubernatorial appointment.
        """
        biennium = "%s-%s" % (session[0:4], session[7:9])
        bill_num = bill_id.split()[1]

        url = ("%s/GetLegislation?biennium=%s&billNumber"
               "=%s" % (self._base_url, biennium, bill_num))

        with self.urlopen(url) as page:
            page = lxml.etree.fromstring(page)
            page = xpath(page, "//wa:Legislation")[0]

            title = xpath(page, "string(wa:LongDescription)")

            bill_type = xpath(
                page, "string(wa:ShortLegislationType/wa:LongLegislationType)")
            bill_type = bill_type.lower()

            if bill_type == 'gubernatorial appointment':
                return

            bill = Bill(session, chamber, bill_id, title, type=[bill_type])

            chamber_name = {'lower': 'House', 'upper': 'Senate'}[chamber]
            # Use the biennium of the session being scraped instead of a
            # hard-coded "2011-12", so other sessions get their own
            # version links.
            version_url = ("http://www.leg.wa.gov/pub/billinfo/%s/"
                           "Htm/Bills/%s %ss/%s.htm" %
                           (biennium, chamber_name, bill_type.title(),
                            bill_num))
            bill.add_version(bill_id, version_url)

            # The data comes from the web service; record the summary page
            # URL as the source instead (hence "fake").
            fake_source = ("http://apps.leg.wa.gov/billinfo/"
                           "summary.aspx?bill=%s&year=%s" %
                           (bill_num, session[0:4]))
            bill.add_source(fake_source)

            self.scrape_sponsors(bill)
            self.scrape_actions(bill)
            self.scrape_votes(bill)

            return bill
Exemple #12
0
    def scrape(self, chamber, session):
        """Scrape the bill cross-reference table and save each bill.

        Every ``valign='middle'`` row after the first yields one bill.
        Ids starting with 'SJ' or 'HJ' are joint resolutions; everything
        else is a bill.  Version links come from columns 6, 9 and 10, the
        fiscal note from column 7 and the summary from column 12.
        """
        abbrevs = {'upper': 'SF', 'lower': 'HB'}
        chamber_abbrev = abbrevs[chamber]

        url = ("http://legisweb.state.wy.us/%s/billindex/"
               "BillCrossRef.aspx?type=%s" % (session, chamber_abbrev))
        page = lxml.html.fromstring(self.urlopen(url))

        rows = page.xpath("//tr[@valign='middle']")
        for tr in rows[1:]:
            bill_id = tr.xpath("string(td[1])").strip()
            title = tr.xpath("string(td[2])").strip()

            is_joint = bill_id[0:2] in ('SJ', 'HJ')
            bill_type = 'joint resolution' if is_joint else 'bill'

            bill = Bill(session, chamber, bill_id, title, type=bill_type)

            self.scrape_digest(bill)

            # versions
            version_links = tr.xpath('td[6]//a')
            version_links += tr.xpath('td[9]//a')
            version_links += tr.xpath('td[10]//a')
            for link in version_links:
                # skip references to other bills
                if not link.text.startswith('See'):
                    bill.add_version(link.text, link.get('href'))

            # documents: only the first link in each column is used
            fiscal_links = tr.xpath('td[7]//a')
            if fiscal_links:
                bill.add_document('Fiscal Note', fiscal_links[0].get('href'))
            summary_links = tr.xpath('td[12]//a')
            if summary_links:
                bill.add_document('Summary', summary_links[0].get('href'))

            bill.add_source(url)
            self.save_bill(bill)
Exemple #13
0
    def scrape_bill_info(self, session, chambers):
        """Read the bill-info CSV and register bills for ``chambers``.

        The first letter of each bill number selects the chamber ('H' ->
        lower, 'S' -> upper).  Rows for chambers not in ``chambers`` are
        skipped.  Each kept bill gets its introducers and subjects from
        ``self._introducers`` / ``self._subjects`` and is stored in
        ``self.bills`` under its bill id.
        """
        info_url = "ftp://ftp.cga.ct.gov/pub/data/bill_info.csv"
        rows = open_csv(self.urlopen(info_url))

        chamber_map = {'H': 'lower', 'S': 'upper'}
        # Checked in order; an id matching neither pattern is a plain bill.
        type_patterns = (
            (r'^(S|H)J', 'joint resolution'),
            (r'^(S|H)R', 'resolution'),
        )

        for row in rows:
            bill_id = row['bill_num']
            chamber = chamber_map[bill_id[0]]
            if chamber not in chambers:
                continue

            # assert that the bill data is from this session, CT is tricky
            assert row['sess_year'] == session

            bill_type = 'bill'
            for pattern, label in type_patterns:
                if re.match(pattern, bill_id):
                    bill_type = label
                    break

            bill = Bill(session, chamber, bill_id, row['bill_title'],
                        type=bill_type)
            bill.add_source(info_url)

            self.scrape_bill_page(bill)

            for introducer in self._introducers[bill_id]:
                bill.add_sponsor('primary', introducer,
                                 official_type='introducer')

            bill['subjects'] = self._subjects[bill_id]

            self.bills[bill_id] = bill
Exemple #14
0
    def scrape_bill_2012(self, chamber, session, bill_id, url):
        """Create a ``Bill`` from the page at ``url``, fill it in and save it.

        Raises:
            ValueError: if ``bill_id`` contains neither 'B' (bill) nor
                'J' (joint resolution).
        """
        html = self.urlopen(url)
        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(url)
        # find <a name="Title">, get parent dt, get parent dl, then dd n dl
        title = doc.xpath('//a[@name="Title"][1]/../../dd[1]/text()')[0].strip()

        summary = doc.xpath('//font[@size="3"]/p/text()')[0].strip()

        if 'B' in bill_id:
            _type = ['bill']
        elif 'J' in bill_id:
            _type = ['joint resolution']
        else:
            # Fail with a clear message rather than an UnboundLocalError on
            # the Bill(...) call below.
            raise ValueError('unknown bill type ' + bill_id)

        bill = Bill(session, chamber, bill_id, title, type=_type,
                    summary=summary)
        bill.add_source(url)

        self.parse_bill_sponsors(doc, bill)     # sponsors
        self.parse_bill_actions(doc, bill)      # actions
        self.parse_bill_documents(doc, bill)    # documents and versions
        self.parse_bill_votes(doc, bill)        # votes

        # subjects: keep only the text before any "-see also-" suffix
        bill['subjects'] = [
            subj.text.split('-see also-')[0]
            for subj in doc.xpath('//a[contains(@href, "/subjects/")]')]

        # add bill to collection
        self.save_bill(bill)
Exemple #15
0
    def scrape_bill(self, session, chamber, bill_type, url):
        """Scrape the measure page at ``url`` and save the resulting bill.

        The bill id is the first space-separated token of the measure
        link text.  Metadata (title, description, referral, subjects,
        companion, introducers) comes from ``parse_bill_metainf_table``.
        Actions and versions are added by their respective table parsers.
        """
        bill_html = self.urlopen(url)
        bill_page = lxml.html.fromstring(bill_html)
        scraped_bill_id = bill_page.xpath(
            "//a[contains(@id, 'LinkButtonMeasure')]")[0].text_content()
        bill_id = scraped_bill_id.split(' ')[0]
        versions_table = bill_page.xpath(
            "//table[contains(@id, 'GridViewVersions')]")[0]

        metainf_table = bill_page.xpath('//div[contains(@id, "itemPlaceholder")]//table[1]')[0]
        action_table = bill_page.xpath('//div[contains(@id, "UpdatePanel1")]//table[1]')[0]

        meta = self.parse_bill_metainf_table(metainf_table)

        # Subjects are ';'-separated.  Drop *every* empty entry, not just
        # the first one (e.g. "a;;b;" has two).
        subs = [s.strip() for s in meta['Report Title'].split(";")]
        subs = [s for s in subs if s]

        b = Bill(session, chamber, bill_id, title=meta['Measure Title'],
                 summary=meta['Description'],
                 referral=meta['Current Referral'],
                 subjects=subs,
                 type=bill_type)
        b.add_source(url)

        companion = meta['Companion'].strip()
        if companion:
            b['companion'] = companion

        for sponsor in meta['Introducer(s)']:
            b.add_sponsor(type='primary', name=sponsor)

        # Both parsers receive the bill; their return values are not used.
        self.parse_bill_actions_table(b, action_table)
        self.parse_bill_versions_table(b, versions_table)

        self.save_bill(b)
Exemple #16
0
    def parse_bill(self, chamber, session, special, link):
        """Scrape the bill referenced by ``link`` and save it.

        ``link`` is an anchor element: its text is the bill number and its
        ``href`` carries a ``type=`` query value ('B' for bill, 'R' for
        resolution).  Votes are fetched only when ``parse_history``
        returns a truthy vote count.

        Raises:
            ValueError: if the link's type is neither 'B' nor 'R'.
        """
        bill_num = link.text.strip()
        type_abbr = re.search('type=(B|R|)', link.attrib['href']).group(1)

        if type_abbr == 'B':
            btype = ['bill']
        elif type_abbr == 'R':
            btype = ['resolution']
        else:
            # The pattern also matches an empty type; reject it here
            # instead of failing later with an UnboundLocalError.
            raise ValueError('unknown bill type %r for bill %s'
                             % (type_abbr, bill_num))

        bill_id = "%s%s %s" % (bill_abbr(chamber), type_abbr, bill_num)

        url = info_url(chamber, session, special, type_abbr, bill_num)
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            # The title is in the cell following the "Short Title:" label.
            title = page.xpath(
                "//td[text() = 'Short Title:']/following-sibling::td")[0]
            title = title.text.strip()

            bill = Bill(session, chamber, bill_id, title, type=btype)
            bill.add_source(url)

            self.parse_bill_versions(bill, page)

            vote_count = self.parse_history(
                bill,
                history_url(chamber, session, special, type_abbr, bill_num))

            # only fetch votes if votes were seen in history
            if vote_count:
                self.parse_votes(
                    bill,
                    vote_url(chamber, session, special, type_abbr, bill_num))

            self.save_bill(bill)
Exemple #17
0
    def scrape_bill(self,
                    url,
                    kw,
                    re_amendment=re.compile(r'(^[A-Z]A \d{1,3}) to'),
                    re_substitution=re.compile(r'(^[A-Z]S \d{1,2}) for'),
                    re_digits=re.compile(r'\d{,5}'),
                    actions_categorize=actions.categorize,
                    actions_get_actor=actions.get_actor):
        """Build a ``Bill`` from ``kw``, populate it from ``url`` and save it.

        The page at ``url`` supplies sponsors, versions, actions, votes,
        amendments, engrossments, fiscal notes and a few extra fields.

        Args:
            url: Address of the bill page; also recorded as the bill source.
            kw: Keyword arguments passed straight to ``Bill(**kw)``.
            re_amendment: Compiled pattern that extracts the short amendment
                id from an amendment link's text.
            re_substitution: Compiled pattern that extracts the substitute
                bill's name from the "Substituted Legislation" text.
            re_digits: Compiled pattern matched against the session heading
                to obtain the session number.
            actions_categorize: Callable returning the action type for an
                action string.
            actions_get_actor: Callable returning the actor for an action
                string and the bill's chamber.

        The regex and callable parameters are bound as defaults when the
        method is defined, so the module-level ``actions`` object is only
        reached through them inside the body.
        """

        bill = Bill(**kw)
        bill.add_source(url)

        #---------------------------------------------------------------------
        # A few helpers.
        _url_2_lxml = self._url_2_lxml
        _cleanup_sponsors = self._cleanup_sponsors

        # Shortcut function partial to get text at a particular xpath:
        doc = _url_2_lxml(url)
        _get_text = partial(get_text, doc, 0)

        # Get session number--needed for fetching related documents (see below).
        xpath = '//font[contains(., "General Assembly") and @face="Arial"]'
        session_num = doc.xpath(xpath)[0].text_content()
        session_num = re_digits.match(session_num).group()

        #---------------------------------------------------------------------
        # Sponsors
        chamber = bill['chamber']

        # Maps the heading text on the page to the sponsor type recorded
        # on the bill; an unlisted heading raises KeyError below.
        sponsor_types = {
            'Additional Sponsor(s):': 'cosponsor',
            'CoSponsors:': 'cosponsor',
            'Primary Sponsor:': 'primary'
        }

        xpath = '//font[contains(., "Sponsor") and @color="#008080"]'
        headings = doc.xpath(xpath + '/text()')
        sponsors = doc.xpath(xpath +
                             '/../../following-sibling::td/font/text()')

        # Headings and sponsor strings are paired by position.
        for h, s in zip(headings, sponsors):

            names = _cleanup_sponsors(s, chamber)
            type_ = sponsor_types[h.strip()]

            if names:
                for name, _chamber in names:
                    bill.add_sponsor(type_, name, chamber=_chamber)

        #---------------------------------------------------------------------
        # Versions

        # URL template handed to scrape_documents as ``tmp``.
        tmp = '/'.join([
            'http://www.legis.delaware.gov',
            'LIS/lis{session_num}.nsf/vwLegislation',
            '{moniker}/$file/{filename}{format_}?open'
        ])

        documents = self.scrape_documents(source=url,
                                          docname="introduced",
                                          filename="Legis",
                                          tmp=tmp,
                                          session_num=session_num)

        for d in documents:
            bill.add_version(**d)

        # If bill is a substitution, add the original as a version.
        names = doc.xpath('//*[contains(text(), "Substituted '
                          'Legislation for Bill:")]/text()')
        urls = doc.xpath('//*[contains(text(), "Substituted '
                         'Legislation for Bill:")]'
                         '/following-sibling::a/@href')

        # NOTE: this loop rebinds the ``url`` parameter.
        for name, url in zip(names, urls):

            name = re_substitution.match(name).group(1)
            bill.add_version(name, url, description='original bill')

        #---------------------------------------------------------------------
        # Actions
        # NOTE: assigning ``actions`` here makes it a local name for the
        # whole function body.
        actions = doc.xpath('//font[contains(., "Actions History")]'
                            '/../following-sibling::table/descendant::td[2]')
        actions = actions[0].text_content()
        # NOTE(review): reversed() below needs a sequence, so this relies on
        # filter() returning a list (Python 2 behaviour) -- confirm before
        # porting.
        actions = filter(None, actions.splitlines())

        # Each non-empty line has the form "<date> - <action text>".
        for a in reversed(actions):
            date, action = a.split(' - ', 1)
            try:
                date = datetime.strptime(date, '%b %d, %Y')
            except ValueError:
                # Fall back to the full month name.
                date = datetime.strptime(date, '%B %d, %Y')  # XXX: ugh.

            actor = actions_get_actor(action, bill['chamber'])
            type_ = actions_categorize(action)
            bill.add_action(actor, action, date, type_)

        #---------------------------------------------------------------------
        # Votes
        vote_strings = doc.xpath('//*[contains(text(), "vote:")]/text()')

        # Sometimes vote strings are contained in weird, separate elements. Probably
        # hand edited.
        if not all(re.search('\d', string) for string in vote_strings):
            # Use the parent's text_content instead.
            vote_strings = []
            for el in doc.xpath('//*[contains(text(), "vote:")]/..'):
                vote_strings.append(el.text_content())

        vote_urls = doc.xpath('//*[contains(text(), "vote:")]'
                              '/following-sibling::a/@href')
        # Vote strings and vote links are paired by position; this loop
        # rebinds ``url`` again.
        for string, url in zip(vote_strings, vote_urls):

            vote_data = parse_votestring(string)
            vote = self.scrape_vote(url, **vote_data)
            if vote:
                bill.add_vote(vote)

        #---------------------------------------------------------------------
        # Amendments
        xpath = ("//font[contains(., 'Amendments')]/" "../../../td[2]/font/a")

        tmp = ('http://www.legis.delaware.gov/LIS/lis{session_num}.nsf/'
               'vwLegislation/{id_}/$file/{filename}{format_}?open')

        # Amendment link targets and link texts are paired by position.
        for source, id_ in zip(doc.xpath(xpath + '/@href'),
                               doc.xpath(xpath + '/text()')):

            short_id = re_amendment.match(id_).group(1)

            documents = self.scrape_documents(source=source,
                                              docname='amendment (%s)' %
                                              short_id,
                                              filename='Legis',
                                              tmp=tmp,
                                              session_num=session_num,
                                              id_=id_)

            for d in documents:
                bill.add_document(**d)

        #---------------------------------------------------------------------
        # Add any related "Engrossments".
        # See www.ncsl.org/documents/legismgt/ILP/98Tab3Pt4.pdf for
        # an explanation of the engrossment process in DE.
        source = doc.xpath('//img[@alt="Engrossment"]/../@href')

        if source:

            tmp = '/'.join([
                'http://www.legis.delaware.gov',
                'LIS/lis{session_num}.nsf/EngrossmentsforLookup',
                '{moniker}/$file/{filename}{format_}?open'
            ])

            # Only the first engrossment link is followed.
            documents = self.scrape_documents(source=source[0],
                                              docname="Engrossment",
                                              filename="Engross",
                                              tmp=tmp,
                                              session_num=session_num,
                                              id_=bill['bill_id'])

            for d in documents:
                bill.add_version(**d)

        # --------------------------------------------------------------------
        # Add any fiscal notes.
        source = doc.xpath("//img[@alt='Fiscal Note']/../@href")

        if source:

            tmp = '/'.join([
                'http://www.legis.delaware.gov',
                'LIS/lis{session_num}.nsf/FiscalforLookup',
                '{docnum}/$file/{filename}{format_}?open'
            ])

            # Only the first fiscal-note link is followed.
            documents = self.scrape_documents(source=source[0],
                                              docname="Fiscal Note",
                                              filename="Fiscal",
                                              tmp=tmp,
                                              session_num=session_num)

            for d in documents:
                bill.add_document(**d)

        #---------------------------------------------------------------------
        # Extra fields

        # Helper to get the first td sibling of certain nodes.
        tmp = '//font[contains(., "%s")]/../../../td[2]'
        first_sibling_text = lambda heading: _get_text(tmp % heading)

        # Maps the key stored on the bill to the heading text looked up on
        # the page.
        extra_fields = {
            # A long description of the legislation.
            "summary": "Synopsis",
            # Codification details for enacted legislation.
            "volume_chapter": "Volume Chapter",
            # Presumably the date of approval/veto.
            "date_governor_acted": "Date Governor Acted",
            "fiscal_notes": "Fiscal Notes",
        }

        for key, name in extra_fields.iteritems():
            try:
                bill[key] = first_sibling_text(name)
            except IndexError:
                # xpath lookup failed.
                pass

        self.save_bill(bill)
Exemple #18
0
    def scrape_bill_pages(self, session, year_abr):
        """Assemble information on bills from a number of DBF and vote files.

        One ``Bill`` is created per MAINBILL record that has a non-blank
        synopsis.  Sponsors (BILLSPON), documents/versions (BILLWP), roll
        call votes (zipped CSV files), actions (BILLHIST) and subjects
        (BILLSUBJ) are then attached, and every bill is saved at the end.

        Records in the secondary tables that reference a bill which was not
        created from MAINBILL (for example one skipped for a blank title)
        are reported through ``self.warning`` and skipped.

        :param session: session identifier; passed to ``Bill`` as a string.
        :param year_abr: first year of the session, used to locate the DBF
            tables and to build document and vote file names.
        :raises Exception: when a BILLWP record has a doctype that is not in
            ``self._doctypes``.
        """
        # Main Bill information
        main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["synopsis"]
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"

            # some bills have a blank title.. just skip it
            if not title:
                continue

            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr,
                                                           'BILLSPON')

        for rec in bill_sponsors_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            bill = bill_dict.get(bill_id)
            if bill is None:
                # e.g. a bill that was skipped above for having no title
                self.warning('invalid bill id in BILLSPON.DBF: %s' % bill_id)
                continue
            if rec["type"] == 'P':
                sponsor_type = "Primary"
            else:
                sponsor_type = "Co-sponsor"
            bill.add_sponsor(sponsor_type, rec["sponsor"])

        # Documents
        bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')

        for rec in bill_document_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            bill = bill_dict.get(bill_id)
            if bill is None:
                self.warning('invalid bill id in BILLWP.DBF: %s' % bill_id)
                continue

            # keep only the last directory and the file name of the
            # backslash-separated path stored in the table
            document = rec["document"].split('\\')
            document = document[-2] + "/" + document[-1]

            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (
                year_abr, document.replace('.DOC', '.HTM'))

            # name document based on doctype
            try:
                doc_name = self._doctypes[rec['doctype']]
            except KeyError:
                raise Exception('unknown doctype %s on %s' %
                                (rec['doctype'], bill_id))
            if rec['comment']:
                doc_name += ' ' + rec['comment']

            if rec['doctype'] in self._version_types:
                bill.add_version(doc_name, htm_url)
            else:
                bill.add_document(doc_name, htm_url)

        # Votes
        next_year = int(year_abr) + 1
        vote_info_list = ['A%s' % year_abr,
                          'A%s' % next_year,
                          'S%s' % year_abr,
                          'S%s' % next_year,
                          'CA%s-%s' % (year_abr, next_year),
                          'CS%s-%s' % (year_abr, next_year),
                          ]

        for filename in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
            s_vote_zip, _resp = self.urlretrieve(s_vote_url)

            if filename.startswith('A') or filename.startswith('CA'):
                chamber = "lower"
            else:
                chamber = "upper"

            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            try:
                zipedfile = zipfile.ZipFile(s_vote_zip)
                try:
                    vote_file = zipedfile.open("%s.txt" % filename, 'U')
                    try:
                        votes = self._tally_vote_file(vote_file, chamber,
                                                      vote_file_type)
                    finally:
                        vote_file.close()
                finally:
                    zipedfile.close()
            finally:
                # remove temp file, even when reading the archive failed
                os.remove(s_vote_zip)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.values():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = len(vote["other_votes"])
                vote["passed"] = vote_yes_count > vote_no_count

                bill = bill_dict.get(vote["bill_id"])
                if bill is None:
                    self.warning('invalid bill id in vote file %s: %s' %
                                 (filename, vote["bill_id"]))
                    continue
                bill.add_vote(vote)

        # Actions
        bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')

        for rec in bill_action_db:
            bill_id = rec["billtype"] + str(int(rec["billnumber"]))
            bill = bill_dict.get(bill_id)
            if bill is None:
                self.warning('invalid bill id in BILLHIST.DBF: %s' % bill_id)
                continue
            action, atype = self.categorize_action(rec["action"])
            if rec["comment"]:
                action += (' ' + rec["comment"])
            bill.add_action(rec["house"], action, rec["dateaction"],
                            type=atype)

        # Subjects
        subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
        for rec in subject_db:
            bill_id = rec['billtype'] + str(int(rec['billnumber']))
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault('subjects', []).append(rec['subjectkey'])
            else:
                self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

        # save all bills at the end
        for bill in bill_dict.values():
            # add sources
            bill.add_source(bill_sponsors_url)
            bill.add_source(bill_document_url)
            bill.add_source(bill_action_url)
            bill.add_source(subject_url)
            self.save_bill(bill)

    def _tally_vote_file(self, vote_file, chamber, vote_file_type):
        """Group the rows of one roll call CSV file into ``Vote`` objects.

        :param vote_file: open file object holding the CSV data.
        :param chamber: chamber ("lower"/"upper") the file belongs to.
        :param vote_file_type: 'chamber' or 'committee'; selects which
            column names are read from each row.
        :returns: dict mapping a vote id (bill id, chamber and motion joined
            by underscores) to the ``Vote`` holding every legislator's vote.
        """
        votes = {}
        for rec in csv.DictReader(vote_file):
            if vote_file_type == 'chamber':
                bill_id = rec["Bill"].strip()
                leg = rec["Full_Name"]
                date = rec["Session_Date"]
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
            else:
                bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                leg = rec['Name']
                # drop time portion
                date = rec['Agenda_Date'].split()[0]
                # make motion readable
                action = self._com_vote_motions[rec['BillAction']]
                # first char (Y/N) use [0:1] to ignore ''
                leg_vote = rec['LegislatorVote'][0:1]

            date = datetime.strptime(date, "%m/%d/%Y")
            vote_id = '_'.join((bill_id, chamber, action))
            vote_id = vote_id.replace(" ", "_")

            if vote_id not in votes:
                votes[vote_id] = Vote(chamber, date, action, None, None,
                                      None, None, bill_id=bill_id)
            if vote_file_type == 'committee':
                votes[vote_id]['committee'] = self._committees[
                    rec['Committee_House']]

            if leg_vote == "Y":
                votes[vote_id].yes(leg)
            elif leg_vote == "N":
                votes[vote_id].no(leg)
            else:
                votes[vote_id].other(leg)
        return votes
Exemple #19
0
    def scrape_bill(self, chamber, session, bill_id, bill_type, url):
        """Scrape one bill detail page and save the resulting ``Bill``.

        Collects the title, sponsors, actions (triggering ``parse_vote`` for
        actions that carry a roll call link), subjects, text versions and
        supporting documents.  Pages without a "TITLE:" label are skipped
        with a warning and nothing is saved.

        :param chamber: chamber the bill belongs to ("lower"/"upper").
        :param session: session identifier, also used in the version and
            document listing URLs.
        :param bill_id: bill identifier, also used in those URLs.
        :param bill_type: value passed to ``Bill`` as ``type``.
        :param url: address of the bill detail page.
        """
        doc = lxml.html.fromstring(self.get(url).text)
        doc.make_links_absolute(url)

        title = doc.xpath('//b[text()="TITLE:"]')
        if title:
            title = title[0].tail.strip().strip('"')
        else:
            self.warning("skipping bill %s, no information" % url)
            return

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)

        # Get sponsors
        spons_str = doc.xpath(
            '//b[contains(text(), "SPONSOR")]')[0].tail.strip()
        sponsors_match = re.match(
            r'(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})', spons_str)
        if sponsors_match:
            sponsors = sponsors_match.group(2).split(',')
            sponsor = sponsors[0].strip()

            if sponsor:
                # use the stripped name, as is done for the cosponsors below
                bill.add_sponsor('primary', sponsor)

            for sponsor in sponsors[1:]:
                sponsor = sponsor.strip()
                if sponsor:
                    bill.add_sponsor('cosponsor', sponsor)
        else:
            # Committee sponsorship
            spons_str = spons_str.strip()

            # The suffix sits at the END of the string, so this has to be a
            # search: re.match only matches at the start and could never
            # succeed on the stripped string.
            if re.search(r' BY REQUEST OF THE GOVERNOR$', spons_str):
                spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$', '',
                                   spons_str).title()
                spons_str = (spons_str +
                             " Committee (by request of the governor)")

            if spons_str:
                bill.add_sponsor('primary', spons_str)

        # Get actions from second myth table
        self._current_comm = None
        # Rows whose chamber marker is neither "(H)" nor "(S)" keep the
        # chamber of the previous row; start from the bill's own chamber so
        # that such a first row does not raise NameError.
        act_chamber = chamber
        act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:]
        for row in act_rows:
            date, journal, raw_chamber, action = row.xpath('td')

            act_date = datetime.datetime.strptime(date.text_content().strip(),
                                                  '%m/%d/%y')
            raw_chamber = raw_chamber.text_content().strip()
            action = action.text_content().strip()

            if raw_chamber == "(H)":
                act_chamber = "lower"
            elif raw_chamber == "(S)":
                act_chamber = "upper"

            # actions that look like "<word> Y<digits>..." carry a vote
            # tally; the journal cell may link to the roll call
            if re.match(r"\w+ Y(\d+)", action):
                vote_href = journal.xpath('.//a/@href')
                if vote_href:
                    self.parse_vote(bill, action, act_chamber, act_date,
                                    vote_href[0])

            action, atype = self.clean_action(action)

            match = re.match(r'^Prefile released (\d+/\d+/\d+)$', action)
            if match:
                # the date embedded in the text overrides the row's date
                action = 'Prefile released'
                act_date = datetime.datetime.strptime(match.group(1),
                                                      '%m/%d/%y')

            bill.add_action(act_chamber, action, act_date, type=atype)

        # Get subjects
        bill['subjects'] = [
            subj.strip()
            for subj in doc.xpath('//a[contains(@href, "subject")]/text()')
        ]

        # Get versions
        text_list_url = "http://www.legis.state.ak.us/"\
            "basis/get_fulltext.asp?session=%s&bill=%s" % (
                session, bill_id)
        bill.add_source(text_list_url)

        text_doc = lxml.html.fromstring(self.get(text_list_url).text)
        text_doc.make_links_absolute(text_list_url)
        for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'):
            name = link.xpath('../preceding-sibling::td/text()')[0].strip()
            text_url = link.get('href')
            bill.add_version(name, text_url, mimetype="text/html")

        # Get documents
        doc_list_url = "http://www.legis.state.ak.us/"\
            "basis/get_documents.asp?session=%s&bill=%s" % (
                session, bill_id)
        doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
        doc_list.make_links_absolute(doc_list_url)
        bill.add_source(doc_list_url)
        for href in doc_list.xpath(
                '//a[contains(@href, "get_documents")][@onclick]'):
            h_name = href.text_content()
            h_href = href.attrib['href']
            # links without visible text are not recorded
            if h_name.strip():
                bill.add_document(h_name, h_href)

        self.save_bill(bill)
Exemple #20
0
    def scrape_bill(self,
                    session,
                    chamber,
                    bill_id,
                    title,
                    url,
                    strip_sponsors=re.compile(r'\s*\(.{,50}\)\s*').sub):
        """Scrape one bill history page and save the resulting ``Bill``.

        Collects subjects, versions, the lead sponsor and cosponsors, house
        and senate votes and the action history.

        :param session: session identifier passed to ``Bill``.
        :param chamber: chamber the bill belongs to.
        :param bill_id: bill identifier; the letters after the first
            character of its first word select the bill type from
            ``self.bill_types``.
        :param title: bill title.
        :param url: address of the bill history page.
        :param strip_sponsors: substitution callable used to remove short
            parenthesised fragments from sponsor strings.
        """
        html = self.get(url).text

        page = lxml.html.fromstring(html)
        page.make_links_absolute(url)

        bill_type = self.bill_types[bill_id.split()[0][1:]]

        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)

        xpath = ('//strong[contains(., "SUBJECT")]/../'
                 'following-sibling::td/a/text()')
        bill['subjects'] = page.xpath(xpath)

        for version in self.scrape_versions(session, chamber, page, bill_id):
            bill.add_version(**version)

        # Resolution pages have different html.
        # Build a {heading: value} map from the rows that have a <strong>
        # heading; the value is the row text with the heading removed.
        values = {}
        for tr in page.xpath('//div[@id="bhistcontent"]/table/tr'):
            heading = tr.xpath('td/strong/text()')
            if not heading:
                continue
            heading = heading[0]
            values[heading] = tr.text_content().replace(heading, '').strip()

        # summary was always same as title
        #bill['summary'] = values['SUMMARY:']

        # Add primary sponsor.
        primary = strip_sponsors('', values.get('LEAD SPONSOR:', ''))
        if primary:
            bill.add_sponsor('primary', primary)

        # Add cosponsors.
        if values.get('SPONSORS:'):
            sponsors = strip_sponsors('', values['SPONSORS:'])
            # split on ", " unless it is followed by an initial ("X.")
            for name in re.split(r', (?![A-Z]\.)', sponsors):
                name = name.strip(', \n\r')
                if not name:
                    continue
                # Fix name splitting bug where "Neale, D. Hall"
                match = re.search(r'(.+?), ([DM]\. Hall)', name)
                if match:
                    for part in match.groups():
                        bill.add_sponsor('cosponsor', part)
                else:
                    bill.add_sponsor('cosponsor', name)

        for link in page.xpath("//a[contains(@href, 'votes/house')]"):
            self.scrape_house_vote(bill, link.attrib['href'])

        actors = {'S': 'upper', 'H': 'lower'}
        # the rows are walked in reverse page order, skipping the first row
        for tr in reversed(
                page.xpath("//table[@class='tabborder']/descendant::tr")[1:]):
            tds = tr.xpath('td')
            if len(tds) < 3:
                continue

            # tolerate surrounding whitespace in the chamber cell
            actor = actors[tds[0].text_content().strip()]

            # Index of date info no longer varies on resolutions.
            date = tds[2].text_content().strip()
            date = datetime.datetime.strptime(date, "%m/%d/%y").date()

            action = tds[1].text_content().strip()
            if action.lower().startswith('passed senate'):
                for href in tds[1].xpath('a/@href'):
                    self.scrape_senate_vote(bill, href, date)

            attrs = dict(actor=actor, action=action, date=date)
            attrs.update(self.categorizer.categorize(action))
            bill.add_action(**attrs)

        self.save_bill(bill)
Exemple #21
0
    def scrape_bill(self, chamber, session, bill_id, bill_type):
        """Scrape a single bill page and save the resulting ``Bill``.

        The page is fetched from ``self.base_url`` with ``bill_id`` as the
        ``r`` query parameter.  Authors, co-authors, linked versions and
        documents, actions and (for lower chamber passage actions that link
        to a document) votes are collected.

        :param chamber: chamber of the bill; also used as the actor of
            every action.
        :param session: session identifier passed to ``Bill``.
        :param bill_id: bill identifier.
        :param bill_type: value passed to ``Bill`` as ``type``.
        :raises NoSuchBill: when the page has no title cell.
        """
        url = '%s?r=%s' % (self.base_url, bill_id)
        with self.urlopen(url) as html:
            doc = lxml.html.fromstring(html)

            # search for Titulo, accent over i messes up lxml, so use 'tulo'
            title = doc.xpath(
                u'//td/b[contains(text(),"tulo")]/../following-sibling::td/text()'
            )
            if not title:
                raise NoSuchBill()
            bill = Bill(session, chamber, bill_id, title[0], type=bill_type)
            author = doc.xpath(
                u'//td/b[contains(text(),"Autor")]/../text()')[0]
            for aname in author.split(','):
                bill.add_sponsor('primary', aname.strip())

            co_authors = doc.xpath(
                u'//td/b[contains(text(),"Co-autor")]/../text()')
            if len(co_authors) != 0:
                for co_author in co_authors[1].split(','):
                    bill.add_sponsor('cosponsor', co_author.strip())

            action_table = doc.xpath('//table')[-1]
            for row in action_table[1:]:
                tds = row.xpath('td')

                # ignore row missing date
                if len(tds) != 2:
                    continue

                date = datetime.datetime.strptime(tds[0].text_content(),
                                                  "%m/%d/%Y")

                action = tds[1].text_content().strip()

                # A linked action is either a new version of the bill or
                # some other document.
                action_url = tds[1].xpath('a/@href')
                if action_url:
                    action_url = action_url[0]
                    # NOTE: not sure if new versions of the bill are only
                    # denoted with 'Entirillado', but it looks like it.
                    if re.match('Entirillado', action):
                        bill.add_version(action, action_url)
                    else:
                        bill.add_document(action, action_url)

                for pattern, atype in _classifiers:
                    if re.match(pattern, action):
                        break
                else:
                    atype = 'other'

                bill.add_action(chamber, action, date, type=atype)

                if atype == 'bill:passed' and action_url:
                    # Bind the chamber only on a real match; unpacking
                    # straight into vote_chamber would leave the LAST table
                    # entry behind when no pattern matches.
                    vote_chamber = None
                    for pattern, candidate in _voteChambers:
                        if re.match(pattern, action):
                            vote_chamber = candidate
                            break
                    else:
                        self.warning('coudnt find voteChamber pattern')

                    if vote_chamber == 'lower':
                        vote = self.scrape_votes(action_url, action, date,
                                                 vote_chamber)
                        if vote[0] is not None:
                            vote[0].add_source(action_url)
                            bill.add_vote(vote[0])
                        else:
                            self.warning('Problem Reading vote: %s,%s' %
                                         (vote[1], bill_id))

            bill.add_source(url)
            self.save_bill(bill)
Exemple #22
0
    def scrape_bill(self, bill_url, chamber, session):
        """Scrape the bill at ``bill_url`` and save it.

        Returns ``False`` when the page reports that the bill could not be
        found and ``True`` once the bill has been saved.  Raises
        ``ScrapeError`` when the summary line cannot be split into subject
        and title, or when the bill id has an unrecognised prefix.
        """
        with self.urlopen(bill_url) as text:
            if "Specified Bill could not be found" in text:
                return False
            page = lxml.html.fromstring(text)
            page.make_links_absolute(bill_url)

            bill_id = page.xpath("string(//h2)").split()[0]

            summary = page.xpath(
                "string(//*[starts-with(text(), 'Summary: ')])"
            ).replace('Summary: ', '')

            # "<subject>: <title>", where the title may begin with a
            # "(Constitutional Amendment) " marker
            parsed = re.match(
                r"^([^:]+): "
                r"((\(Constitutional [aA]mendment\) )?[^(]+)", summary)
            if not parsed:
                raise ScrapeError("Bad title")
            subjects = [parsed.group(1).strip()]
            title = parsed.group(2).strip()

            if bill_id.startswith(('SB', 'HB')):
                bill_type = ['bill']
            elif bill_id.startswith(('SR', 'HR')):
                bill_type = ['resolution']
            elif bill_id.startswith(('SCR', 'HCR')):
                bill_type = ['concurrent resolution']
            else:
                raise ScrapeError("Invalid bill ID format: %s" % bill_id)

            if title.startswith("(Constitutional Amendment)"):
                bill_type.append('constitutional amendment')
                title = title.replace('(Constitutional Amendment) ', '')

            bill = Bill(session,
                        chamber,
                        bill_id,
                        title,
                        subjects=subjects,
                        type=bill_type)
            bill.add_source(bill_url)

            history_anchor = page.xpath("//a[text() = 'History']")[0]
            self.scrape_history(bill, history_anchor.attrib['href'])

            authors_anchor = page.xpath("//a[text() = 'Authors']")[0]
            self.scrape_authors(bill, authors_anchor.attrib['href'])

            try:
                all_versions = page.xpath(
                    "//a[text() = 'Text - All Versions']")[0]
                self.scrape_versions(bill, all_versions.attrib['href'])
                for doc in ["Notes", "Digest", "Amendments", "Misc"]:
                    doc_anchor = page.xpath("//a[text() = '%s']" % doc)[0]
                    self.scrape_documents(bill, doc_anchor.attrib['href'])
            except IndexError:
                # Only current version
                try:
                    current = page.xpath(
                        "//a[text() = 'Text - Current']")[0]
                    bill.add_version("%s Current" % bill_id,
                                     current.attrib['href'],
                                     on_duplicate="use_old")
                except IndexError:
                    # Some bills don't have any versions :(
                    pass

            try:
                votes_anchor = page.xpath("//a[text() = 'Votes']")[0]
                self.scrape_votes(bill, votes_anchor.attrib['href'])
            except IndexError:
                # Some bills don't have any votes
                pass

            self.save_bill(bill)

            return True
Exemple #23
0
    def process_bill(self, data):
        """Build a ``Bill`` from the dictionary ``data`` and save it.

        Copies the abstract, extras, actions, sources, sponsorships,
        versions, documents, other titles, related bills and other
        identifiers found in ``data`` onto the bill.
        """
        chamber = parse_psuedo_id(data['from_organization'])['classification']
        if chamber == 'legislature':
            chamber = 'upper'

        bill = Bill(data['legislative_session'], chamber, data['identifier'],
                    data['title'], subjects=data['subject'],
                    type=data['classification'])

        abstracts = data['abstracts']
        if abstracts:
            bill['summary'] = abstracts[0]['abstract']
        bill.update(**data['extras'])

        for act in data['actions']:
            actor = parse_psuedo_id(act['organization_id'])['classification']

            # sort the related entities into people and organizations;
            # any other entity type is ignored
            legislators = []
            committees = []
            buckets = {'organization': committees, 'person': legislators}
            for entity in act['related_entities']:
                bucket = buckets.get(entity['entity_type'])
                if bucket is not None:
                    bucket.append(entity['name'])

            bill.add_action(
                actor,
                act['description'],
                parse_date(act['date']),
                type=_action_categories(act['classification']),
                committees=committees,
                legislators=legislators,
                **act.get('extras', {}))

        for src in data['sources']:
            bill.add_source(src['url'])

        for sponsorship in data['sponsorships']:
            bill.add_sponsor(sponsorship['classification'],
                             sponsorship['name'])

        # every link of a version/document becomes its own entry
        for ver in data['versions']:
            for lnk in ver['links']:
                bill.add_version(
                    ver['note'],
                    lnk['url'],
                    mimetype=lnk['media_type'],
                    date=parse_date(ver['date']),
                    **ver.get('extras', {}))

        for document in data['documents']:
            for lnk in document['links']:
                bill.add_document(
                    document['note'],
                    lnk['url'],
                    mimetype=lnk['media_type'],
                    date=parse_date(document['date']),
                    **document.get('extras', {}))

        for other in data['other_titles']:
            bill.add_title(other['title'])

        for companion in data['related_bills']:
            bill.add_companion(companion['identifier'],
                               companion['legislative_session'], chamber)

        bill['alternate_bill_ids'] = [
            other_id['identifier'] for other_id in data['other_identifiers']
        ]
        self.save_bill(bill)
Exemple #24
0
    def parse_bill(self, chamber, session, bill_id, url):
        """Parse one bill page and save the resulting ``Bill``.

        Returns without saving when the page has no link to the bill text
        (treated as a withdrawn bill).  Otherwise collects the title, the
        most recent version, sponsors, the action history and, when linked,
        the vote history document.

        :param chamber: chamber of the bill; also the default actor for
            actions that do not name a chamber.
        :param session: session identifier; its first four characters are
            used as the year of every action date.
        :param bill_id: bill identifier; also the key into
            ``self._subjects``.
        :param url: address of the bill page.
        """
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            try:
                # the document links use "SJ"/"SC" where the id has
                # "SJR"/"SCR"
                short_bill_id = re.sub(r'S([JC])R', r'S\1', bill_id)

                version_link = page.xpath(
                    "//a[contains(@href, '%s/bill.doc')]" % short_bill_id)[0]
            except IndexError:
                # Bill withdrawn
                return

            pars = version_link.xpath("following-sibling::p")
            if len(pars) == 2:
                title = pars[0].xpath("string()")
                action_p = pars[1]
            else:
                # otherwise the title is the text that follows the element
                # just before the first paragraph
                title = pars[0].getprevious().tail
                action_p = pars[0]

            # collapse runs of whitespace / non-breaking spaces
            title = re.sub(u'[\\s\\xa0]+', ' ', title).strip()

            if 'CR' in bill_id:
                bill_type = 'concurrent resolution'
            elif 'JR' in bill_id:
                bill_type = 'joint resolution'
            elif 'R' in bill_id:
                bill_type = 'resolution'
            else:
                bill_type = 'bill'

            bill = Bill(session, chamber, bill_id, title, type=bill_type)
            bill['subjects'] = self._subjects[bill_id]
            bill.add_source(url)

            bill.add_version("Most Recent Version",
                             version_link.attrib['href'])

            for link in page.xpath("//a[contains(@href, 'legislator/')]"):
                bill.add_sponsor('primary', link.text.strip())

            amendment_re = (r'floor amendments?( \([a-z\d\-]+\))*'
                            r'( and \([a-z\d\-]+\))? filed')

            for line in action_p.xpath("string()").split("\n"):
                action = line.strip()
                if (not action or action == 'last action' or
                        'Prefiled' in action):
                    continue

                # each line is "<Mon DD>-<action text>"; the year comes
                # from the session
                date_part, _sep, action = action.partition('-')
                action_date = datetime.datetime.strptime(
                    "%s %s" % (date_part, session[0:4]), '%b %d %Y')

                if action.endswith('House') or action.endswith('(H)'):
                    actor = 'lower'
                elif action.endswith('Senate') or action.endswith('(S)'):
                    actor = 'upper'
                else:
                    actor = chamber

                atype = []
                if action.startswith('introduced in'):
                    atype.append('bill:introduced')
                    if '; to ' in action:
                        atype.append('committee:referred')
                elif action.startswith('signed by Governor'):
                    atype.append('governor:signed')
                elif re.match(r'^to [A-Z]', action):
                    atype.append('committee:referred')
                elif action == 'adopted by voice vote':
                    atype.append('bill:passed')

                if '1st reading' in action:
                    atype.append('bill:reading:1')
                if '3rd reading' in action:
                    atype.append('bill:reading:3')
                if '2nd reading' in action:
                    atype.append('bill:reading:2')

                # The exact-match branch above may already have recorded
                # the passage; do not list 'bill:passed' twice.
                if ('R' in bill_id and 'adopted by voice vote' in action
                        and 'bill:passed' not in atype):
                    atype.append('bill:passed')

                if re.search(amendment_re, action):
                    atype.append('amendment:introduced')

                if not atype:
                    atype = ['other']

                bill.add_action(actor, action, action_date, type=atype)

            try:
                votes_link = page.xpath(
                    "//a[contains(@href, 'vote_history.pdf')]")[0]
                bill.add_document("Vote History",
                                  votes_link.attrib['href'])
            except IndexError:
                # No votes
                pass

            self.save_bill(bill)
Exemple #25
0
    def scrape_bill_sheet(self, session, chamber):
        """
        Scrape the bill sheet (the page full of bills and other small bits of data)
        """
        sheet_url = self.get_bill_folder(session, chamber)

        bill_chamber = {"Senate": "upper", "House": "lower"}[chamber]

        index = {
            "id": 0,
            "title_sponsor": 1,
            "version": 2,
            "history": 3,
            "votes": 7
        }

        with self.urlopen(sheet_url) as sheet_html:
            sheet_page = lxml.html.fromstring(sheet_html)

            bills = sheet_page.xpath('//table/tr')

            for bill in bills:
                bill_id = self.read_td(bill[index["id"]][0])

                if bill_id == None:
                    # Every other entry is null for some reason
                    continue

                dot_loc = bill_id.find('.')
                if dot_loc != -1:
                    # budget bills are missing the .pdf, don't truncate
                    bill_id = bill_id[:dot_loc]
                title_and_sponsor = bill[index["title_sponsor"]][0]

                bill_title = title_and_sponsor.text
                bill_title_and_sponsor = title_and_sponsor.text_content()
                sponsors = bill_title_and_sponsor.replace(bill_title, "").\
                    replace(" & ...", "").split("--")

                cats = {
                    "SB": "bill",
                    "HB": "bill",
                    "HR": "resolution",
                    "SR": "resolution",
                    "SCR": "concurrent resolution",
                    "HCR": "concurrent resolution",
                    "SJR": "joint resolution",
                    "HJR": "joint resolution",
                    "SM": "memorial",
                    "HM": "memorial"
                }

                bill_type = None

                for cat in cats:
                    if bill_id[:len(cat)] == cat:
                        bill_type = cats[cat]

                b = Bill(session,
                         bill_chamber,
                         bill_id,
                         bill_title,
                         type=bill_type)

                b.add_source(sheet_url)

                versions_url = \
                    bill[index["version"]].xpath('font/a')[0].attrib["href"]
                versions_url = CO_URL_BASE + versions_url
                versions = self.parse_versions(versions_url)
                for version in versions:
                    b.add_version(version['name'],
                                  version['link'],
                                  mimetype=version['mimetype'])

                bill_history_href = CO_URL_BASE + \
                    bill[index["history"]][0][0].attrib['href']
                # ^^^^^^^ We assume this is a full path to the target.
                # might want to consider some better rel-path support
                # XXX: Look at this ^

                history = self.parse_history(bill_history_href)
                b.add_source(bill_history_href)

                for action in history:
                    self.add_action_to_bill(b, action)

                for sponsor in sponsors:
                    if sponsor != None and sponsor != "(NONE)" and \
                       sponsor != "":
                        b.add_sponsor("primary", sponsor)

                # Now that we have history, let's see if we can't grab some
                # votes

                bill_vote_href = self.get_vote_url(bill_id, session)
                votes = self.parse_votes(bill_vote_href)

                if votes['sanity-check'] != bill_id:
                    self.warning("XXX: READ ME! Sanity check failed!")
                    self.warning(" -> Scraped ID: " + votes['sanity-check'])
                    self.warning(" -> 'Real' ID:  " + bill_id)
                    assert votes['sanity-check'] == bill_id

                for vote in votes['votes']:
                    filed_votes = vote['votes']
                    passage = vote['meta']
                    result = vote['result']

                    composite_time = "%s %s" % (passage['x-parent-date'],
                                                passage['TIME'])
                    # It's now like: 04/01/2011 02:10:14 PM
                    pydate = dt.datetime.strptime(composite_time,
                                                  "%m/%d/%Y %I:%M:%S %p")
                    hasHouse = "House" in passage['x-parent-ctty']
                    hasSenate = "Senate" in passage['x-parent-ctty']

                    if hasHouse and hasSenate:
                        actor = "joint"
                    elif hasHouse:
                        actor = "lower"
                    else:
                        actor = "upper"

                    other = (int(result['EXC']) + int(result['ABS']))
                    # OK, sometimes the Other count is wrong.
                    local_other = 0
                    for voter in filed_votes:
                        l_vote = filed_votes[voter].lower().strip()
                        if l_vote != "yes" and l_vote != "no":
                            local_other = local_other + 1

                    if local_other != other:
                        self.warning( \
                            "XXX: !!!WARNING!!! - resetting the 'OTHER' VOTES" )
                        self.warning(" -> Old: %s // New: %s" %
                                     (other, local_other))
                        other = local_other

                    v = Vote(actor,
                             pydate,
                             passage['MOTION'],
                             (result['FINAL_ACTION'] == "PASS"),
                             int(result['YES']),
                             int(result['NO']),
                             other,
                             moved=passage['MOVED'],
                             seconded=passage['SECONDED'])

                    v.add_source(vote['meta']['url'])
                    # v.add_source( bill_vote_href )

                    # XXX: Add more stuff to kwargs, we have a ton of data
                    for voter in filed_votes:
                        who = voter
                        vote = filed_votes[who]
                        if vote.lower() == "yes":
                            v.yes(who)
                        elif vote.lower() == "no":
                            v.no(who)
                        else:
                            v.other(who)
                    b.add_vote(v)
                self.save_bill(b)
Exemple #26
0
    def scrape(self, session, chambers):
        """Scrape all legislation of one council period from DC's LIMS API.

        Pages through the public advanced-search endpoint and, for every
        result that is not an agenda, fetches the detail record and saves a
        ``Bill`` carrying sponsors, alternate titles, actions, documents and
        votes.

        :param session: council period; indexes ``self.get_member_ids()`` and
            is sent as the ``CouncilPeriod`` search criterion.
        :param chambers: not used here; every bill is filed under "upper".
        """
        # Module-level list, reset for each bill below.  Presumably read by
        # add_documents() to skip repeated versions -- confirm there.
        global bill_versions

        #get member id matching for vote parsing
        member_ids = self.get_member_ids()[session]
        per_page = 10  #seems like it gives me 10 no matter what.
        start_record = 0

        headers = {"Content-Type": "application/json"}
        url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicAdvancedSearch"
        bill_url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicData"
        params = {
            "request": {
                "sEcho": 2,
                "iColumns": 4,
                "sColumns": "",
                "iDisplayStart": 0,
                "iDisplayLength": per_page,
                "mDataProp_0": "ShortTitle",
                "mDataProp_1": "Title",
                "mDataProp_2": "LegislationCategories",
                "mDataProp_3": "Modified",
                "iSortCol_0": 0,
                "sSortDir_0": "asc",
                "iSortingCols": 0,
                "bSortable_0": "true",
                "bSortable_1": "true",
                "bSortable_2": "true",
                "bSortable_3": "true"
            },
            "criteria": {
                "Keyword": "",
                "Category": "",
                "SubCategoryId": "",
                "RequestOf": "",
                "CouncilPeriod": str(session),
                "Introducer": "",
                "CoSponsor": "",
                "CommitteeReferral": "",
                "CommitteeReferralComments": "",
                "StartDate": "",
                "EndDate": "",
                "QueryLimit": 100,
                "FilterType": "",
                "Phases": "",
                "LegislationStatus": "0",
                "IncludeDocumentSearch": "false"
            }
        }

        while True:
            # one search request per page; an empty page ends the scrape
            params["request"]["iDisplayStart"] = start_record
            response = self.post(url,
                                 headers=headers,
                                 data=json.dumps(params))
            #the response is a terrible string-of-nested-json-strings. Yuck.
            data = self.decode_json(response.json()["d"])["aaData"]
            if not data:
                break

            for result in data:

                #sometimes they're in there more than once, so we'll keep track
                bill_versions = []

                bill_id = result["Title"]
                if bill_id.startswith("AG"):
                    #actually an agenda, skip
                    continue
                bill_params = {"legislationId": bill_id}
                bill_info = self.post(bill_url,
                                      headers=headers,
                                      data=json.dumps(bill_params))
                bill_info = self.decode_json(bill_info.json()["d"])["data"]
                bill_source_url = "http://lims.dccouncil.us/Legislation/" + bill_id

                legislation_info = bill_info["Legislation"][0]
                title = legislation_info["ShortTitle"]

                if bill_id.startswith(("R", "CER")):
                    bill_type = "resolution"
                else:
                    bill_type = "bill"

                #dc has no chambers. calling it all upper
                bill = Bill(session, "upper", bill_id, title, type=bill_type)

                #sponsors and cosponsors
                if "Introducer" in legislation_info:
                    introducers = legislation_info["Introducer"]
                    intro_date = self.date_format(
                        legislation_info["IntroductionDate"])
                    bill.add_action("upper",
                                    "Introduced",
                                    intro_date,
                                    type="bill:introduced")
                else:
                    #sometimes there are introducers, sometimes not.
                    # Use an empty list to avoid downstream breakage, but log
                    # bills without introducers
                    self.logger.warning("No Introducer: {0} {1}: {2}".format(
                        bill['chamber'], bill['session'], bill['bill_id']))
                    introducers = []

                #sometimes there are cosponsors, sometimes not.
                cosponsors = legislation_info.get("CoSponsor", [])

                for sponsor_type, people in (("primary", introducers),
                                             ("cosponsor", cosponsors)):
                    for person in people:
                        sponsor_name = person["Name"]
                        #they messed up Phil Mendelson's name
                        if sponsor_name == "Phil Pmendelson":
                            sponsor_name = "Phil Mendelson"
                        bill.add_sponsor(name=sponsor_name, type=sponsor_type)

                #if it's become law, add the law number as an alternate title;
                #also sometimes it's got an act number
                for number_key in ("LawNumber", "ActNumber"):
                    number = legislation_info.get(number_key)
                    if number:
                        bill.add_title(number)

                #sometimes AdditionalInformation has a previous bill name
                if "AdditionalInformation" in legislation_info:
                    add_info = legislation_info["AdditionalInformation"]
                    if "previously" in add_info.lower():
                        prev_title = add_info.lower().replace(
                            "previously", "").strip().replace(" ", "")
                        bill.add_title(prev_title.upper())
                    elif add_info:
                        bill["additional_information"] = add_info

                if "WithDrawnDate" in legislation_info:
                    withdrawn_date = self.date_format(
                        legislation_info["WithDrawnDate"])
                    withdrawn_by = legislation_info["WithdrawnBy"][0][
                        "Name"].strip()
                    if withdrawn_by == "the Mayor":
                        bill.add_action("executive", "withdrawn",
                                        withdrawn_date, "bill:withdrawn")
                    elif "committee" in withdrawn_by.lower():
                        bill.add_action("upper",
                                        "withdrawn",
                                        withdrawn_date,
                                        "bill:withdrawn",
                                        committees=withdrawn_by)
                    else:
                        bill.add_action("upper",
                                        "withdrawn",
                                        withdrawn_date,
                                        "bill:withdrawn",
                                        legislators=withdrawn_by)

                #deal with actions involving the mayor
                mayor = bill_info["MayorReview"]
                if mayor:
                    mayor = mayor[0]

                    #in dc, mayor == governor because openstates schema
                    if "TransmittedDate" in mayor:
                        transmitted_date = self.date_format(
                            mayor["TransmittedDate"])

                        bill.add_action("executive",
                                        "transmitted to mayor",
                                        transmitted_date,
                                        type="governor:received")

                    if 'SignedDate' in mayor:
                        signed_date = self.date_format(mayor["SignedDate"])

                        bill.add_action("executive",
                                        "signed",
                                        signed_date,
                                        type="governor:signed")

                    elif 'ReturnedDate' in mayor:
                        #if returned but not signed, it was vetoed
                        veto_date = self.date_format(mayor["ReturnedDate"])

                        bill.add_action("executive",
                                        "vetoed",
                                        veto_date,
                                        type="governor:vetoed")

                        if 'EnactedDate' in mayor:
                            #if it was returned and enacted but not signed,
                            #there was a veto override
                            override_date = self.date_format(
                                mayor["EnactedDate"])

                            bill.add_action("upper",
                                            "veto override",
                                            override_date,
                                            type="bill:veto_override:passed")

                    if 'AttachmentPath' in mayor:
                        #documents relating to the mayor's review
                        self.add_documents(mayor["AttachmentPath"], bill)

                congress = bill_info["CongressReview"]
                if congress:
                    congress = congress[0]
                    if "TransmittedDate" in congress:
                        transmitted_date = self.date_format(
                            congress["TransmittedDate"])

                        bill.add_action("other",
                                        "Transmitted to Congress for review",
                                        transmitted_date)

                #deal with committee actions
                if "DateRead" in legislation_info:
                    date = legislation_info["DateRead"]
                elif "IntroductionDate" in legislation_info:
                    date = legislation_info["IntroductionDate"]
                else:
                    # NOTE: this skips the rest of the bill, so it is never
                    # saved
                    self.logger.warning(
                        "Crap, we can't find anything that looks like an action date. Skipping"
                    )
                    continue
                date = self.date_format(date)
                if "CommitteeReferral" in legislation_info:
                    committees = []
                    for committee in legislation_info["CommitteeReferral"]:
                        if committee["Name"].lower(
                        ) == "retained by the council":
                            # a retained bill is recorded with no referral
                            committees = []
                            break
                        committees.append(committee["Name"])
                    if committees:
                        bill.add_action("upper",
                                        "referred to committee",
                                        date,
                                        committees=committees,
                                        type="committee:referred")

                if "CommitteeReferralComments" in legislation_info:
                    committees = [
                        committee["Name"] for committee in
                        legislation_info["CommitteeReferralComments"]
                    ]
                    bill.add_action("upper",
                                    "comments from committee",
                                    date,
                                    committees=committees,
                                    type="other")

                #deal with random docs floating around
                for d in bill_info["OtherDocuments"]:
                    if "AttachmentPath" in d:
                        self.add_documents(d["AttachmentPath"], bill)
                    else:
                        self.logger.warning(
                            "Document path missing from 'Other Documents'")

                if "MemoLink" in legislation_info:
                    self.add_documents(legislation_info["MemoLink"], bill)

                if "AttachmentPath" in legislation_info:
                    self.add_documents(legislation_info["AttachmentPath"],
                                       bill)

                #full council votes
                for vote in bill_info["VotingSummary"]:
                    self.process_vote(vote, bill, member_ids)

                #deal with committee votes
                for committee_action in bill_info.get("CommitteeMarkup", []):
                    self.process_committee_vote(committee_action, bill)
                    # The old code tested the markup *list* for the key (never
                    # true) and then used a stale `vote` and an undefined
                    # `is_version`.  Attachments are looked up per entry
                    # instead, called like the other add_documents() sites.
                    # NOTE(review): assumes each markup entry is a dict, like
                    # the "OtherDocuments" entries -- confirm against the API.
                    if "AttachmentPath" in committee_action:
                        self.add_documents(committee_action["AttachmentPath"],
                                           bill)

                bill.add_source(bill_source_url)
                self.save_bill(bill)

            #get next page
            start_record += per_page
Exemple #27
0
    def scrape(self, chamber, session):
        """Scrape Kansas bills for one chamber from the ``bill_status/`` feed.

        Bills whose number does not start with the chamber's letter are
        skipped.  Every remaining bill gets its titles, sponsors and history
        recorded, is passed to ``scrape_html`` and is then saved (also when
        ``scrape_html`` fails with an HTTP error).

        :param chamber: 'upper' selects Senate bills; any other value selects
            House bills.
        :param session: session passed through to each ``Bill``.
        :raises ScrapeError: if ``which abiword`` reports failure.
        :raises ValueError: if a bill number contains none of the 'CR', 'R'
            or 'B' type markers.
        """
        # check for abiword
        if os.system('which abiword') != 0:
            raise ScrapeError('abiword is required for KS scraping')

        chamber_name = 'Senate' if chamber == 'upper' else 'House'
        chamber_letter = chamber_name[0]
        # perhaps we should save this data so we can make one request for both?
        with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
            bill_request_json = json.loads(bill_request)
            bills = bill_request_json['content']
            for bill_data in bills:

                bill_id = bill_data['BILLNO']

                # filter other chambers
                if not bill_id.startswith(chamber_letter):
                    continue

                # 'CR' must be tested before 'R', which it contains
                if 'CR' in bill_id:
                    btype = 'concurrent resolution'
                elif 'R' in bill_id:
                    btype = 'resolution'
                elif 'B' in bill_id:
                    btype = 'bill'
                else:
                    # without this branch the previous bill's type would be
                    # silently reused (or a NameError raised on the first bill)
                    raise ValueError('unknown bill type ' + bill_id)

                title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

                # main
                bill = Bill(session,
                            chamber,
                            bill_id,
                            title,
                            type=btype,
                            status=bill_data['STATUS'])
                bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

                # keep the long title as an alternate when it differs
                if (bill_data['LONGTITLE']
                        and bill_data['LONGTITLE'] != bill['title']):
                    bill.add_title(bill_data['LONGTITLE'])

                # a lone sponsor is primary; several are all cosponsors
                sponsor_names = bill_data['SPONSOR_NAMES']
                stype = 'primary' if len(sponsor_names) == 1 else 'cosponsor'
                for sponsor in sponsor_names:
                    bill.add_sponsor(stype, sponsor)

                # history is backwards
                for event in reversed(bill_data['HISTORY']):

                    actor = ('upper'
                             if event['chamber'] == 'Senate' else 'lower')

                    date = datetime.datetime.strptime(
                        event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                    # append committee names if present
                    if 'committee_names' in event:
                        action = (event['status'] + ' ' +
                                  ' and '.join(event['committee_names']))
                    else:
                        action = event['status']

                    if event['action_code'] not in ksapi.action_codes:
                        self.warning(
                            'unknown action code on %s: %s %s' %
                            (bill_id, event['action_code'], event['status']))
                        atype = 'other'
                    else:
                        atype = ksapi.action_codes[event['action_code']]
                    bill.add_action(actor, action, date, type=atype)

                try:
                    self.scrape_html(bill)
                except scrapelib.HTTPError:
                    # best effort: the bill is still saved without HTML data
                    self.warning('unable to fetch HTML for bill {0}'.format(
                        bill['bill_id']))
                self.save_bill(bill)
Exemple #28
0
    def scrape_details(self, bill_detail_url, session, chamber, bill_id):
        """Build and save one bill from its detail page.

        Reads the bill type, summary (used as the title), sponsors, versions,
        actions and, when a vote-history link is present, votes.  Returns
        without saving when the page reports an invalid bill number.

        :param bill_detail_url: detail page to fetch; also the bill's source.
        :param session: session passed to ``Bill``.
        :param chamber: chamber passed to ``Bill``.
        :param bill_id: bill identifier; also the key into ``self._subjects``.
        :raises ValueError: if the type label matches no known bill type.
        """
        html = self.urlopen(bill_detail_url)
        if 'INVALID BILL NUMBER' in html:
            self.warning('INVALID BILL %s' % bill_detail_url)
            return

        doc = lxml.html.fromstring(html)
        doc.make_links_absolute(bill_detail_url)
        bill_div = doc.xpath('//div[@style="margin:0 0 40px 0;"]')[0]

        # First matching marker wins, so the plain 'Resolution' entry has to
        # come after the more specific resolution kinds.
        type_label = bill_div.xpath('span/text()')[0]
        known_types = (
            ('General Bill', 'bill'),
            ('Concurrent Resolution', 'concurrent resolution'),
            ('Joint Resolution', 'joint resolution'),
            ('Resolution', 'resolution'),
        )
        for marker, bill_type in known_types:
            if marker in type_label:
                break
        else:
            raise ValueError('unknown bill type: %s' % type_label)

        # this is fragile, but less fragile than it was: the summary is the
        # tail text of the element following the bold "Summary:" label
        summary_label = bill_div.xpath('./b[text()="Summary:"]')[0]
        bill_summary = summary_label.getnext().tail.strip()

        bill = Bill(session, chamber, bill_id, bill_summary, type=bill_type)
        bill['subjects'] = list(self._subjects[bill_id])

        # sponsors: every link to a member page
        sponsor_names = doc.xpath('//a[contains(@href, "member.php")]/text()')
        for name in sponsor_names:
            bill.add_sponsor('sponsor', name)

        # versions are listed on the separate full-text page
        version_url = doc.xpath('//a[text()="View full text"]/@href')[0]
        version_doc = lxml.html.fromstring(self.urlopen(version_url))
        version_doc.make_links_absolute(version_url)
        for link in version_doc.xpath('//a[contains(@href, "/prever/")]'):
            # duplicate versions with same date, use first appearance
            bill.add_version(link.text,
                             link.get('href'),
                             on_duplicate='use_old')

        # actions: one table row each (date, chamber, description)
        chamber_names = {'Senate': 'upper', 'House': 'lower', None: 'other'}
        for row in bill_div.xpath('table/tr'):
            date_td, chamber_td, action_td = row.xpath('td')

            date = datetime.datetime.strptime(date_td.text, "%m/%d/%y")
            action_chamber = chamber_names[chamber_td.text]

            # drop the trailing journal reference from the description
            text = action_td.text_content()
            text = text.split('(House Journal')[0]
            action = text.split('(Senate Journal')[0].strip()

            bill.add_action(action_chamber, action, date, action_type(action))

        # votes
        vote_links = doc.xpath('//a[text()="View Vote History"]/@href')
        if vote_links:
            self.scrape_vote_history(bill, vote_links[0])

        bill.add_source(bill_detail_url)
        self.save_bill(bill)
Exemple #29
0
    def scrape_senate_bills(self, chamber, insert, session, year):
        """Scrape and save every bill listed in the per-type history reports.

        For each document type id the list page is fetched, then each linked
        bill page is parsed for id, title, text link, sponsors, minutes,
        actions and votes.

        :param chamber: chamber recorded on each ``Bill`` (actions are always
            scraped with "upper").
        :param insert: session path segment interpolated into the URLs.
        :param session: session passed to ``Bill``.
        :param year: forwarded to ``scrape_votes``.
        """
        # DoctypeID query value -> bill type
        doc_type = {
            2: 'bill',
            4: 'resolution',
            7: 'concurrent resolution',
            8: 'joint resolution'
        }

        for docnum, bill_type in doc_type.iteritems():
            parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/HistListBills.cfm?DoctypeID=%s' % (
                insert, docnum)

            for link in self.scrape_links(parentpage_url):
                page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (
                    insert, link)

                with self.urlopen(page_path) as raw_page:
                    # non-breaking spaces are replaced with plain spaces
                    page = raw_page.decode("utf8").replace(u"\xa0", " ")
                    root = lxml.html.fromstring(page)

                    bill_id = root.xpath(
                        'string(/html/body/div[@id="content"]/table[1]/tr[1]/td[1]/font)'
                    )
                    title = root.xpath(
                        'string(/html/body/div[@id="content"]/table[1]/tr[5]/td)'
                    )
                    bill = Bill(session,
                                chamber,
                                bill_id,
                                title,
                                type=bill_type)

                    text_href = root.xpath(
                        "string(/html/body/div[@id='content']/table[6]/tr/td[2]/a/@href)"
                    )
                    bill.add_version("Bill Text",
                                     "http://www.leg.state.nv.us" + text_href)

                    primary, secondary = self.scrape_sponsors(page)

                    if primary and primary[0] == 'By:':
                        # A leading 'By:' means the remaining parts form one
                        # sponsor name; each part is followed by a space.
                        primary.pop(0)

                        if primary[0] == 'ElectionsProceduresEthicsand':
                            primary[0] = 'Elections Procedures Ethics and'

                        full_name = ''.join(part + " " for part in primary)
                        bill.add_sponsor('primary', full_name)
                    else:
                        for legislator in primary:
                            bill.add_sponsor('primary', legislator)
                    for legislator in secondary:
                        bill.add_sponsor('cosponsor', legislator)

                    # The n-th minutes link takes its date from table row
                    # n + 1, so the row counter starts at 2.
                    minutes_links = root.xpath('//table[4]/tr/td[3]/a')
                    for row_number, anchor in enumerate(minutes_links, 2):
                        minutes_url = ("http://www.leg.state.nv.us" +
                                       anchor.xpath("string(@href)"))
                        date_parts = anchor.xpath(
                            "string(//table[4]/tr[%s]/td[2])" %
                            row_number).split()
                        # the first three tokens are joined without spaces
                        doc_name = (date_parts[0] + date_parts[1] +
                                    date_parts[2] + " Minutes")
                        bill.add_document(doc_name, minutes_url)

                    self.scrape_actions(root, bill, "upper")
                    self.scrape_votes(page, bill, insert, year)
                    bill.add_source(page_path)
                    self.save_bill(bill)
Exemple #30
0
    def old_scrape(self, session):
        """Scrape bills for ``session`` from the Excel status reports.

        Finds the status table following the page ``div`` whose text contains
        ``session``, downloads every linked "Excel" report and turns each data
        row of its first sheet into a saved ``Bill`` with sponsors, dated
        actions, votes and versions.

        :param session: session label; located on the status-report page and
            passed to ``Bill`` and the vote/version helpers.
        """
        status_report_url = "http://www.legislature.ohio.gov/legislation/status-reports"

        #ssl verification off due Ohio not correctly implementing SSL
        doc = self.get(status_report_url, verify=False).text
        doc = lxml.html.fromstring(doc)
        doc.make_links_absolute(status_report_url)

        status_table = doc.xpath(
            "//div[contains(text(),'{}')]/following-sibling::table".format(
                session))[0]
        status_links = status_table.xpath(
            ".//a[contains(text(),'Excel')]/@href")

        for url in status_links:

            try:
                fname, _resp = self.urlretrieve(url)
            except scrapelib.HTTPError:
                # previously formatted an undefined name, raising NameError
                self.logger.warning("Missing report {}".format(url))
                continue

            sh = xlrd.open_workbook(fname).sheet_by_index(0)

            # once workbook is open, we can remove tempfile
            os.remove(fname)
            # row 0 holds the column headers
            for rownum in range(1, sh.nrows):
                bill_id = sh.cell(rownum, 0).value

                bill_type = "resolution" if "R" in bill_id else "bill"
                chamber = "lower" if "H" in bill_id else "upper"

                bill_title = str(sh.cell(rownum, 3).value)

                bill = Bill(session,
                            chamber,
                            bill_id,
                            bill_title,
                            type=bill_type)
                bill.add_source(url)
                bill.add_sponsor('primary', str(sh.cell(rownum, 1).value))

                # add cosponsor
                cosponsor = sh.cell(rownum, 2).value
                if cosponsor:
                    bill.add_sponsor('cosponsor', str(cosponsor))

                # the actor carries over to columns whose header names none
                actor = ""

                # Actions start column after bill title
                for colnum in range(4, sh.ncols - 1):
                    action = str(sh.cell(0, colnum).value)
                    date = sh.cell(rownum, colnum).value

                    words = action.split()
                    if words:
                        if words[0] == 'House':
                            actor = "lower"
                        elif words[0] == 'Senate':
                            actor = "upper"
                        elif (words[-1] == 'Governor'
                              or 'Gov.' in (words[0], words[-1])):
                            actor = "executive"

                    if action in ('House Intro. Date', 'Senate Intro. Date'):
                        atype = ['bill:introduced']
                        action = action.replace('Intro. Date', 'Introduced')
                    elif action == '3rd Consideration':
                        atype = ['bill:reading:3', 'bill:passed']
                    elif action == 'Sent to Gov.':
                        atype = ['governor:received']
                    elif action == 'Signed By Governor':
                        atype = ['governor:signed']
                    else:
                        atype = ['other']

                    # only float cells are treated as dates; others are skipped
                    if isinstance(date, float):
                        date = datetime.datetime(
                            *xlrd.xldate_as_tuple(date, 0))
                        bill.add_action(actor, action, date, type=atype)

                # insert "_" before the first digit of the bill id
                for idx, char in enumerate(bill_id):
                    try:
                        int(char)
                    except ValueError:
                        continue

                    underscore_bill = bill_id[:idx] + "_" + bill_id[idx:]
                    break
                else:
                    # no digit: use the id as-is instead of reusing the
                    # previous row's value (or hitting an unbound name)
                    self.logger.warning(
                        "No number found in bill id {}".format(bill_id))
                    underscore_bill = bill_id

                self.scrape_votes_old(bill, underscore_bill, session)
                self.scrape_versions_old(bill, underscore_bill, session)
                self.save_bill(bill)