Ejemplo n.º 1
0
 def scrape_session_2009(self, chamber, session):
     """Scrape 2009-session bills for `chamber`.

     Walks every bill link on the chamber's bill-list page, builds a
     Bill from the detail page (id, title, actions, votes) and attaches
     the version documents from the versions page.
     """
     url, type = bills_url(chamber)

     with self.urlopen(url) as page_html:
         page = lxml.html.fromstring(page_html)
         for element, attribute, link, pos in page.iterlinks():
             # Only follow links that point at a bill detail page.
             if re.search("billtype=" + type + "&billnumber=[0-9]+", link) is None:
                 continue
             bill_page_url = bill_url(link)
             with self.urlopen(bill_page_url) as bill_page_str:
                 bill_page = lxml.html.fromstring(bill_page_str)
                 bill_number = link.split("=")[-1]
                 bill_id = bill_page.cssselect('a[class="headerlink"]')[0].text_content()
                 bill_title = bill_page.cssselect('td[style="color:Black"]')[0].text_content()
                 bill = Bill(session, chamber, bill_id, bill_title)
                 bill.add_source(bill_page_url)

                 actions_table = bill_page.cssselect('table[rules="all"]')[0]
                 action_elements = actions_table.cssselect('tr')
                 # first element is a header row, not an action
                 action_elements.pop(0)

                 for ae in action_elements:
                     action_element_parts = ae.cssselect('td')

                     action_date = action_element_parts[0]
                     # BUG FIX: these cells are lxml elements; the original
                     # compared the element itself against 'H'/'S' (always
                     # False) and passed the element to add_action.  Extract
                     # the text once and use it everywhere.
                     actor_house = action_element_parts[1].text_content().strip()
                     action_text = action_element_parts[2].text_content()

                     # look for acting committees
                     match = re.search(
                         "(committee\(s\)|committee) on ([A-Z]{3}(/|-)[A-Z]{3}|[A-Z]{3})",
                         action_text)

                     if match is not None:
                         actor = match.group(0)
                     elif actor_house == 'H':
                         actor = "lower"
                     elif actor_house == 'S':
                         actor = "upper"
                     else:
                         actor = chamber

                     action_date = dt.datetime.strptime(
                         action_date.text_content(), '%m/%d/%Y')

                     if re.search("The votes were as follows", action_text) is not None:
                         self.scrape_votes(action_text, bill_page_url,
                                           actor_house, action_date, bill)

                     bill.add_action(actor, action_text, action_date)

                 # NOTE(review): unlike the sibling scrapers, this method never
                 # calls self.save_bill(bill) -- confirm whether the caller
                 # saves, or whether bills are silently dropped here.
                 with self.urlopen(versions_page_url(type, bill_number)) as versions_page_html:
                     versions_page = lxml.html.fromstring(versions_page_html)
                     versions_elements = versions_page.cssselect('span[class="searchtitle"]')
                     for ve in versions_elements:
                         element_text = ve.text_content()
                         # BUG FIX: rstrip("_.HTM") strips any trailing run of
                         # the characters _, ., H, T, M (mangling names ending
                         # in those letters); remove the literal suffix instead.
                         version_name = re.sub(r'_?\.HTM$', '', element_text)
                         bill.add_version(version_name, bill_version_url(element_text))
Ejemplo n.º 2
0
    def scrape2009(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2009_10/sum/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Bill: "<id> - <name>" heading.
            name = page.cssselect('#legislation h1')[0].text_content().strip()

            bill_id = name.split(' - ')[0].strip()

            bill = Bill(session, chamberName, bill_id, name)

            # Sponsorships
            for a in page.cssselect("#sponsors a"):
                bill.add_sponsor('', a.text_content().strip())

            # Actions: skip the header row, then rows without a date.
            for row in page.cssselect('#history tr')[1:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()

                if '/' not in date:
                    continue

                date = datetime.datetime.strptime(date, '%m/%d/%Y')

                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions
            # BUG FIX: the original read `a`, the stale loop variable left
            # over from the sponsors loop, so every version got the last
            # sponsor's text/href.  Use the current link element.
            for link in page.cssselect('#versions a'):
                bill.add_version(link.text_content(),
                                 urlparse.urljoin(url, link.get('href')))

            self.save_bill(bill)
Ejemplo n.º 3
0
    def scrape2003(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/2003_04/sum/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('center table')

            # Bill: name follows the first '-' in the heading table.
            name = tables[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Sponsorships
            for a in tables[1].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions: second-to-last inner table, skipping two header rows.
            center = page.cssselect('center table center')[0]

            for row in center.cssselect('table')[-2].cssselect('tr')[2:]:
                date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                if '/' not in date:
                    continue
                if action_text.startswith('Senate'):
                    bill.add_action('upper', action_text, date)
                elif action_text.startswith('House'):
                    bill.add_action('lower', action_text, date)

            # Versions
            # BUG FIX: the original read `a`, the stale variable from the
            # sponsors loop, so every version got the last sponsor's
            # text/href.  Use the current link element.
            for link in center.cssselect('table')[-1].cssselect('a'):
                bill.add_version(link.text_content(),
                                 urlparse.urljoin(url, link.get('href')))

            self.save_bill(bill)
Ejemplo n.º 4
0
    def scrape1999(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1999_00/leg/sum/sb1.htm"
        # BUG FIX: the context variable was bound as `lxml` while the body
        # reads `page`, so this raised NameError on every call.
        with self.lxml_context(url) as page:
            # Grab the interesting tables on the page.
            tables = page.cssselect('table')

            # Bill: name follows the first '-' of the second table's link.
            name = tables[1].cssselect('a')[0].text_content().split('-', 1)[1]
            bill = Bill(session, chamberName, number, name)

            # Versions: the full text lives under /fulltext/ instead of /sum/.
            bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

            # Sponsorships
            for a in tables[2].cssselect('a'):
                bill.add_sponsor('', a.text_content().strip())

            # Actions: each row is senate date | action text | house date.
            for row in tables[-1].cssselect('tr'):
                senate_date = row[0].text_content().strip()
                action_text = row[1].text_content().strip()
                house_date = row[2].text_content().strip()
                # Rows with no date on either side are headers/filler.
                if '/' not in senate_date and '/' not in house_date:
                    continue
                if senate_date:
                    bill.add_action('upper', action_text, senate_date)
                if house_date:
                    bill.add_action('lower', action_text, house_date)

            self.save_bill(bill)
Ejemplo n.º 5
0
    def parse_bill(self, chamber, session, bill_id, bill_info_url):
        """Scrape one bill's info page into a Bill and save it.

        Bills whose current-version link is missing are treated as
        withdrawn and skipped.
        """
        with self.urlopen(bill_info_url) as bill_info_data:
            bill_info = self.soup_parser(bill_info_data)
            version_url = '%s/bill.doc' % bill_id
            version_link = bill_info.find(href=version_url)

            if not version_link:
                # This bill was withdrawn
                return

            # Title is the first text chunk of the paragraph right after
            # the version link.
            bill_title = version_link.findNext('p').contents[0].strip()

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_version("Most Recent Version",
                             session_url(session) + version_url)
            bill.add_source(bill_info_url)

            # Sponsor pages look like legislator/S123.htm / legislator/H123.htm.
            sponsor_links = bill_info.findAll(href=re.compile(
                    'legislator/[SH]\d+\.htm'))

            for sponsor_link in sponsor_links:
                bill.add_sponsor('primary', sponsor_link.contents[0].strip())

            # Actions are the text runs of the last <p> after the version link.
            action_p = version_link.findAllNext('p')[-1]
            for action in action_p.findAll(text=True):
                action = action.strip()
                if (not action or action == 'last action' or
                    'Prefiled' in action):
                    continue

                # The part before the first '-' is a "%b %d" date with no year.
                action_date = action.split('-')[0]
                action_date = dt.datetime.strptime(action_date, '%b %d')
                # Fix:
                # Assumes session looks like e.g. "2011RS" so that
                # session[2:4] is the two-digit year -- TODO confirm.
                action_date = action_date.replace(
                    year=int('20' + session[2:4]))

                action = '-'.join(action.split('-')[1:])

                # Trailing chamber tag decides the actor.
                if action.endswith('House') or action.endswith('(H)'):
                    actor = 'lower'
                elif action.endswith('Senate') or action.endswith('(S)'):
                    actor = 'upper'
                else:
                    actor = chamber

                bill.add_action(actor, action, action_date)

            # Attach the roll-call PDF when the page links one.
            vote_link = bill_info.find(href=re.compile('.*/vote_history.pdf'))
            if vote_link:
                bill.add_document(
                    'vote_history.pdf',
                    bill_info_url.replace('.htm', '') + "/vote_history.pdf")

            self.save_bill(bill)
Ejemplo n.º 6
0
    def scrape_bill(self, chamber, session, billid, histurl, year):
        """Scrape one bill's history page into a Bill and save it.

        `year` selects the session: values not starting with 'R' are used
        as the session directly; otherwise a sub-session is looked up in
        the scraper metadata.
        """
        if year[0] != 'R':
            session = year
        else:
            # NOTE(review): in this branch year[0] == 'R', so int(year[0])
            # raises ValueError -- a different index was probably intended;
            # confirm against the metadata layout.
            session = self.metadata['session_details'][year][
                'sub_sessions'][int(year[0]) - 1]

        with self.urlopen(histurl) as data:
            # cleansource is a module-level helper (defined elsewhere).
            soup = BeautifulSoup(cleansource(data))
            basicinfo = soup.findAll('div', id='bhistleft')[0]
            hist = basicinfo.table

            sponsor = None
            title = None
            # Bold SUMMARY / SPONSOR labels precede the values we want.
            for b in basicinfo.findAll('b'):
                if b.next.startswith('SUMMARY'):
                    title = b.findNextSiblings(text=True)[0].strip()
                elif b.next.startswith('SPONSOR'):
                    for a in b.findNextSiblings('a'):
                        if not issponsorlink(a):
                            break
                        sponsor = cleansponsor(a.contents[0])

            bill = Bill(session, chamber, billid, title)

            if sponsor:
                bill.add_sponsor('primary', sponsor)

            # Each row of the basic-info table links one bill version.
            for row in hist.findAll('tr'):
                link = row.td.a
                vlink = urlbase % link['href']
                vname = link.contents[0].strip()
                bill.add_version(vname, vlink)

            # Action-history table; first row is the header.
            history = soup.findAll('div', id='bhisttab')[0].table
            rows = history.findAll('tr')[1:]
            for row in rows:
                tds = row.findAll('td')
                if len(tds) < 2:
                    # This is not actually an action
                    continue
                date, action = row.findAll('td')[:2]
                date = dt.datetime.strptime(date.contents[0], '%m/%d/%y')
                action = action.contents[0].strip()
                if 'House' in action:
                    actor = 'lower'
                elif 'Senate' in action:
                    actor = 'upper'
                else:  # for lack of a better
                    actor = chamber

                bill.add_action(actor, action, date)

        self.save_bill(bill)
Ejemplo n.º 7
0
    def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
        """Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        # Normalize the chamber name to the canonical lower/upper form.
        chamber = 'lower' if chamber == "House" else 'upper'

        with self.urlopen(bill_detail_url) as detail_html:
            detail_soup = BeautifulSoup(detail_html)
            bill = Bill(session, chamber,
                        self.extract_bill_id(detail_soup),
                        self.extract_bill_title(detail_soup))

        # Versions of a bill are on a separate page, linked to from the
        # column labeled, "Bill Text", on the search results page.
        with self.urlopen(version_list_url) as versions_html:
            versions_soup = BeautifulSoup(versions_html)

            # MN bills can have multiple versions; add every one.
            self.debug("Extracting bill versions from: " + version_list_url)
            for entry in self.extract_bill_versions(versions_soup):
                bill.add_version(entry['name'],
                                 urlparse.urljoin(VERSION_URL_BASE,
                                                  entry['url']))

            # MN uses "Primary Author" to name a bill's primary sponsor;
            # everyone listed after is added as a 'cosponsor'.
            all_sponsors = self.extract_bill_sponsors(detail_soup)
            bill.add_sponsor('primary', all_sponsors[0])
            for cosponsor in all_sponsors[1:]:
                bill.add_sponsor('cosponsor', cosponsor)

            # Add Actions performed on the bill.
            for act in self.extract_bill_actions(detail_soup, chamber):
                bill.add_action(act['action_chamber'],
                                act['action_text'],
                                act['action_date'])

        self.save_bill(bill)
Ejemplo n.º 8
0
    def scrape1995(self, url, year, chamberName, session, number):
        "e.g. http://www.legis.ga.gov/legis/1995_96/leg/sum/sb1.htm"
        with self.lxml_context(url) as page:
            # The bill name follows the <br> inside the page's <h3>.
            title = page.cssselect('h3 br')[0].tail.split('-', 1)[1].strip()
            bill = Bill(session, chamberName, number, title)

            # Versions: the full text lives under /fulltext/ instead of /sum/.
            bill.add_version('Current', url.replace('/sum/', '/fulltext/'))

            # Sponsorships sit between the two marker rows.
            for tr in page.cssselect('center table tr'):
                label = tr.text_content().strip()
                if label == 'Sponsor and CoSponsors':
                    continue
                if label == 'Links / Committees / Status':
                    break
                for anchor in tr.cssselect('a'):
                    bill.add_sponsor('', anchor.text_content().strip())

            # Actions come from a fixed-width <pre> table shaped like:
            #     SENATE                         HOUSE
            #     -------------------------------------
            #   1/13/95   Read 1st time          2/6/95
            #   1/31/95   Favorably Reported
            #   2/1/95    Read 2nd Time          2/7/95
            lines = page.cssselect('pre')[0].text_content().split('\n')
            for line in lines[2:]:
                upper_date = line[:22].strip()
                text = line[23:46].strip()
                lower_date = line[46:].strip()

                # A line with no date on either side is filler.
                if '/' not in upper_date and '/' not in lower_date:
                    continue

                if upper_date:
                    bill.add_action('upper', text, upper_date)

                if lower_date:
                    bill.add_action('lower', text, lower_date)

            self.save_bill(bill)
Ejemplo n.º 9
0
    def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
        """Extracts all the requested info for a given bill.

        Calls the parent's methods to enter the results into JSON files.
        """
        # Normalize the chamber name to the canonical lower/upper form.
        chamber = 'lower' if chamber == "House" else 'upper'

        with self.urlopen(bill_detail_url) as bill_html:
            detail = lxml.html.fromstring(bill_html)

            bill_id = detail.xpath('//title/text()')[0].split()[0]
            title = detail.xpath('//font[@size=-1]/text()')[0]
            # The id's second character encodes the measure type.
            kind_by_letter = {'F': 'bill', 'R': 'resolution',
                              'C': 'concurrent resolution'}
            bill = Bill(session, chamber, bill_id, title,
                        type=kind_by_letter[bill_id[1]])
            bill.add_source(bill_detail_url)

            # First author is the primary sponsor, the rest cosponsors.
            author_names = detail.xpath(
                '//table[@summary="Show Authors"]/descendant::a/text()')
            if author_names:
                bill.add_sponsor('primary', author_names[0].strip())
                for extra in author_names[1:]:
                    bill.add_sponsor('cosponsor', extra.strip())

            # Add Actions performed on the bill.
            for entry in self.extract_bill_actions(detail, chamber):
                bill.add_action(entry['action_chamber'],
                                entry['action_text'],
                                entry['action_date'],
                                type=entry['action_type'])

        # Versions of a bill are on a separate page, linked to from the
        # column labeled, "Bill Text", on the search results page.
        with self.urlopen(version_list_url) as version_html:
            listing = lxml.html.fromstring(version_html)
            for anchor in listing.xpath('//a[starts-with(@href, "/bin/getbill.php")]'):
                bill.add_version(anchor.text.strip(),
                                 urlparse.urljoin(VERSION_URL_BASE,
                                                  anchor.get('href')))

        self.save_bill(bill)
Ejemplo n.º 10
0
    def scrape(self, chamber, session):
        """Emit one hard-coded bill with sponsors, votes and actions."""
        self.validate_session(session)

        # The demo bill originates in `chamber`; the mirror chamber gets
        # the second introduction action.
        if chamber == 'upper':
            other_chamber, bill_id = 'lower', 'SB 1'
        else:
            other_chamber, bill_id = 'upper', 'HB 1'

        bill = Bill(session, chamber, bill_id, 'A super bill')
        bill.add_source('http://example.com/')
        bill.add_version('As Introduced', 'http://example.com/SB1.html')
        bill.add_document('Google', 'http://google.com')
        bill.add_sponsor('primary', 'Bob Smith')
        bill.add_sponsor('secondary', 'Johnson, Sally')

        # A passing upper-chamber vote...
        first_date = datetime.datetime.strptime('1/29/2010', '%m/%d/%Y')
        passage = Vote('upper', first_date, 'Final passage', True, 2, 0, 0)
        passage.yes('Smith')
        passage.yes('Johnson')

        # ...and a failing lower-chamber vote.
        second_date = datetime.datetime.strptime('1/30/2010', '%m/%d/%Y')
        failure = Vote('lower', second_date, 'Final passage', False, 0, 1, 1)
        failure.no('Bob Smith')
        failure.other('S. Johnson')

        bill.add_vote(passage)
        bill.add_vote(failure)

        bill.add_action(chamber, 'introduced', first_date)
        bill.add_action(chamber, 'read first time', second_date)
        bill.add_action(other_chamber, 'introduced', second_date)

        self.save_bill(bill)
Ejemplo n.º 11
0
def parse_bill(scraper, url):
    """Given a bill status URL, return a fully loaded Bill object, except for votes, which
       are expected to be handled externally.
    """
    session = extract_session(url)
    chamber = chamber_for_doctype(extract_doctype(url))
    soup = get_soup(scraper, url)
    bill_id = extract_bill_id(soup)

    # The bill's name sits in the sibling of the "Short Description" label.
    marker = soup(text=re.compile(".*Short Description.*"))
    name_node = marker[0].findParent().findNextSibling()
    bill = Bill(session, chamber, bill_id, get_text(name_node).strip(),
                status_url=url)

    actions = extract_actions(soup)
    for act_chamber, act_text, act_date in actions:
        bill.add_action(act_chamber, act_text, act_date)  # kwargs are permitted if we have 'em.

    # Sponsors are recovered from the action descriptions themselves.
    sponsor_dict = extract_sponsors_from_actions([a[1] for a in actions])
    for sponsor_type, namelist in sponsor_dict.iteritems():
        for name in namelist:
            bill.add_sponsor(sponsor_type, name)

    for version_name, version_link in extract_versions(scraper, soup):
        bill.add_version(version_name, version_link)

    return bill
Ejemplo n.º 12
0
    def scrape_bill(self, chamber, session, bill_number, ga_num):
        """Scrape one TN bill info page: sponsor, summary, actions, votes."""
        bill_url = self.urls['info'] % (bill_number, ga_num)

        with self.urlopen(bill_url) as page:
            page = lxml.html.fromstring(page)
            title = page.xpath("//span[@id='lblAbstract']")[0].text

            bill = Bill(session, chamber, bill_number, title)
            bill.add_source(bill_url)

            # Primary Sponsor: the text after "by", minus any '*' marker.
            sponsor_span = page.xpath("//span[@id='lblBillSponsor']")[0]
            sponsor = sponsor_span.text_content().split("by")[-1]
            sponsor = sponsor.replace('*', '').strip()
            bill.add_sponsor('primary', sponsor)

            # Co-sponsors unavailable for scraping (loaded into page via AJAX)

            # Full summary doc
            summary_link = page.xpath("//span[@id='lblBillSponsor']/a")[0]
            bill.add_document('Full summary', summary_link.get('href'))

            # Actions: every row after the history table's header.
            history = page.xpath("//table[@id='tabHistoryAmendments_tabHistory_gvBillActionHistory']")[0]
            for row in history.xpath("tr[position()>1]"):
                cells = row.xpath("td")
                taken = cells[0].text
                when = datetime.datetime.strptime(cells[1].text.strip(),
                                                  '%m/%d/%Y')
                bill.add_action(chamber, taken, when)

            # Vote details, when the page links them.
            votes_link = page.xpath("//span[@id='lblBillVotes']/a")
            if len(votes_link) > 0:
                votes_link = votes_link[0].get('href')
                bill = self.scrape_votes(bill, sponsor, 'http://wapp.capitol.tn.gov/apps/Billinfo/%s' % (votes_link,))

            self.save_bill(bill)
Ejemplo n.º 13
0
    def parse_bill_xml(self, chamber, session, txt):
        """Build and return a Bill from a bill-history XML document."""
        root = lxml.etree.fromstring(txt)
        bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])
        bill_title = root.findtext("caption")

        # Regular sessions keep only the leading two characters.
        if session[2] == 'R':
            session = session[0:2]

        bill = Bill(session, chamber, bill_id, bill_title)

        # Exact action descriptions map straight to an action type; the
        # prefix-matched descriptions are handled separately below.
        exact_types = {
            'Amended': 'amendment:passed',
            'Amendment(s) offered': 'amendment:introduced',
            'Amendment amended': 'amendment:amended',
            'Amendment withdrawn': 'amendment:withdrawn',
            'Passed': 'bill:passed',
            'Filed': 'bill:introduced',
        }

        for action in root.findall('actions/action'):
            act_date = dt.datetime.strptime(action.findtext('date'),
                                            "%m/%d/%Y")

            extra = {'action_number': action.find('actionNumber').text}
            comment = action.find('comment')
            if comment is not None and comment.text:
                extra['comment'] = comment.text.strip()

            # The action number's leading letter names the acting body.
            actor = {'H': 'lower',
                     'S': 'upper',
                     'E': 'executive'}[extra['action_number'][0]]

            desc = action.findtext('description').strip()

            if desc in exact_types:
                act_type = exact_types[desc]
            elif desc.startswith(('Received by the Secretary of',
                                  'Received from the')):
                act_type = 'bill:introduced'
            elif desc.startswith('Signed by the Governor'):
                act_type = 'bill:signed'
            else:
                act_type = 'other'

            bill.add_action(actor, action.findtext('description'),
                            act_date, type=act_type, **extra)

        # Sponsor fields are pipe-delimited lists; skip empty entries.
        for role in ('author', 'coauthor', 'sponsor', 'cosponsor'):
            for name in root.findtext(role + 's').split(' | '):
                if name != "":
                    bill.add_sponsor(role, name)

        bill['subjects'] = []
        for subject in root.iterfind('subjects/subject'):
            bill['subjects'].append(subject.text.strip())

        return bill
Ejemplo n.º 14
0
    def scrape_session_old(self, chamber, session):
        """Scrape an older-format VT session's bills for `chamber`.

        Posts a search to the rintro results page, then walks each bill's
        info page for versions, sponsors and actions in both chambers.
        """
        if chamber == "lower":
            bill_abbr = "H."
            chamber_name = "House"
            other_chamber = "Senate"
        else:
            bill_abbr = "S."
            chamber_name = "Senate"
            other_chamber = "House"

        # Search from Jan 1 of the session's first year.
        start_date = '1/1/%s' % session.split('-')[0]
        data = urllib.urlencode({'Date': start_date,
                                 'Body': bill_abbr[0],
                                 'Session': session.split('-')[1]})
        bill_list_url = "http://www.leg.state.vt.us/database/"\
            "rintro/results.cfm"
        bill_list = BeautifulSoup(urllib2.urlopen(bill_list_url, data))

        bill_link_re = re.compile('.*?Bill=%s.\d+.*' % bill_abbr[0])
        for bill_link in bill_list.findAll('a', href=bill_link_re):
            bill_id = bill_link.string
            bill_title = bill_link.parent.parent.findAll('td')[1].string
            bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_source(bill_info_url)

            info_page = BeautifulSoup(self.urlopen(bill_info_url))

            # Bill text versions are linked from the last blockquote.
            text_links = info_page.findAll('blockquote')[-1].findAll('a')
            for text_link in text_links:
                bill.add_version(text_link.string,
                                 "http://www.leg.state.vt.us" +
                                 text_link['href'])

            # First bold name is the primary sponsor, the rest cosponsors.
            sponsors = info_page.find(
                text='Sponsor(s):').parent.findNext('td').findAll('b')
            bill.add_sponsor('primary', sponsors[0].string)
            for sponsor in sponsors[1:]:
                bill.add_sponsor('cosponsor', sponsor.string)

            # Grab actions from the originating chamber
            # NOTE(review): unlike the other-chamber loop below, these
            # dates are passed through as strings -- confirm whether they
            # should also be parsed with strptime.
            act_table = info_page.find(
                text='%s Status:' % chamber_name).findNext('table')
            for row in act_table.findAll('tr')[3:]:
                action = clean_action(row.td.string.replace(
                        '&nbsp;', '').strip(':'))

                act_date = row.findAll('td')[1].b.string.replace('&nbsp;', '')
                if act_date != "":
                    detail = row.findAll('td')[2].b
                    if detail and detail.string != "":
                        action += ": %s" % detail.string.replace('&nbsp;', '')
                    bill.add_action(chamber, action, act_date,
                                    type=action_type(action))

            # Grab actions from the other chamber
            act_table = info_page.find(
                text='%s Status:' % other_chamber).findNext('table')
            if act_table:
                if chamber == 'upper':
                    act_chamber = 'lower'
                else:
                    act_chamber = 'upper'
                for row in act_table.findAll('tr')[3:]:
                    action = clean_action(row.td.string.replace(
                            '&nbsp;', '').strip(':'))

                    act_date = row.findAll('td')[1].b.string.replace(
                        '&nbsp;', '')
                    if act_date != "":
                        detail = row.findAll('td')[2].b
                        if detail and detail.string != "":
                            action += ": %s" % detail.string.replace(
                                '&nbsp;', '')
                        # BUG FIX: the parsed datetime was previously
                        # discarded and the raw string passed to add_action.
                        date = dt.datetime.strptime(act_date, '%m/%d/%Y')
                        bill.add_action(act_chamber, action, date,
                                        type=action_type(action))

            self.save_bill(bill)
Ejemplo n.º 15
0
    def scrape_session_new(self, chamber, session):
        """Scrape a newer-format VT session's bills for `chamber`.

        Walks the bill list, then each bill's info page for versions,
        actions (including executive/Governor actions), votes and sponsors.
        """
        if chamber == "lower":
            bill_abbr = "H."
        else:
            bill_abbr = "S."

        bill_list_path = "docs/bills.cfm?Session=%s&Body=%s" % (
            session.split('-')[1], bill_abbr[0])
        bill_list_url = "http://www.leg.state.vt.us/" + bill_list_path
        bill_list = BeautifulSoup(self.urlopen(bill_list_url))

        # Links whose href carries a "Bill=H.123"-style id are bill pages.
        bill_link_re = re.compile('.*?Bill=%s\.\d+.*' % bill_abbr[0])
        for bill_link in bill_list.findAll('a', href=bill_link_re):
            bill_id = bill_link.string
            bill_title = bill_link.parent.findNext('b').string
            bill_info_url = "http://www.leg.state.vt.us" + bill_link['href']

            bill = Bill(session, chamber, bill_id, bill_title)
            bill.add_source(bill_info_url)

            info_page = BeautifulSoup(self.urlopen(bill_info_url))

            # Second blockquote holds links to the bill text versions.
            text_links = info_page.findAll('blockquote')[1].findAll('a')
            for text_link in text_links:
                bill.add_version(text_link.string,
                                 "http://www.leg.state.vt.us" +
                                 text_link['href'])

            # Third blockquote holds the action-history table (header row
            # skipped).
            act_table = info_page.findAll('blockquote')[2].table
            for row in act_table.findAll('tr')[1:]:
                # The action text is spread over the second cell's text nodes.
                action = ""
                for s in row.findAll('td')[1].findAll(text=True):
                    action += s + " "
                action = clean_action(action)

                # Executive actions carry their date inline in the text.
                match = re.search('Governor on (.*)$', action)
                if match:
                    act_date = parse_exec_date(match.group(1).strip())
                    # NOTE(review): sibling scrapers use 'executive' for this
                    # actor -- confirm 'Governor' is the intended value here.
                    actor = 'Governor'
                else:
                    # Salmon-background rows are treated as House ('lower')
                    # actions, everything else as Senate ('upper').
                    if row['bgcolor'] == 'Salmon':
                        actor = 'lower'
                    else:
                        actor = 'upper'

                    if row.td.a:
                        act_date = row.td.a.string
                    else:
                        act_date = row.td.string

                    try:
                        act_date = re.search(
                            '\d{1,2}/\d{1,2}/\d{4,4}', act_date).group(0)
                    except AttributeError:
                        # No date, skip
                        continue

                    act_date = dt.datetime.strptime(act_date, '%m/%d/%Y')

                bill.add_action(actor, action, act_date,
                                type=action_type(action))

                # "Details" links lead to roll-call vote pages.
                vote_link = row.find('a', text='Details')
                if vote_link:
                    vote_url = vote_link.parent['href']
                    self.parse_vote_new(bill, actor, vote_url)

            # First bold name is the primary sponsor, the rest cosponsors.
            sponsors = info_page.find(
                text='Sponsor(s):').parent.parent.findAll('b')
            bill.add_sponsor('primary', sponsors[0].string)
            for sponsor in sponsors[1:]:
                bill.add_sponsor('cosponsor', sponsor.string)

            self.save_bill(bill)
Ejemplo n.º 16
0
 def scrape(self, chamber, session):
     """Scrape bills from the Puerto Rico House search form.

     Submits cr_buscar.asp once per (body, bill type) combination, then
     walks the result tables extracting bill metadata, sponsors, actions
     and document links, saving a Bill for each result.
     """
     bill_search_url = 'http://www.camaraderepresentantes.org/cr_buscar.asp'
     bill_types = {'Project': 'P',
                   'Resolution': 'R',
                   'Joint Resolution': 'RC',
                   'Concurrent Resolution': 'RK',
                   'Appointment': 'N'}
     #bodies = {'upper':'S', 'lower':'C'}
     bodies = {'upper': 'S'}

     bill_search_page = lxml.html.parse(bill_search_url).getroot()
     search_form = bill_search_page.forms[0]

     for body in bodies.itervalues():
         for bill_type in bill_types.itervalues():
             search_form.fields['cuerpo'] = body
             search_form.fields['tipo'] = bill_type
             search_form.fields['autor'] = 'NA'

             # Constrain the search's date range to the session year.
             if year_from_session(session) == '2009':
                 search_form.fields['f2'] = '12/31/2009'
             elif year_from_session(session) == '2010':
                 search_form.fields['f1'] = '01/01/2010'

             result = lxml.html.parse(lxml.html.submit_form(search_form)).getroot()
             table_elements = result.cssselect('table')
             # The final table is page chrome, not bill data.
             table_elements.pop()
             # Results repeat in groups of three tables per bill.
             bill_elements = grouper(3, table_elements)

             for actions, complete_data, bill_data in bill_elements:
                 td_elements = bill_data.cssselect('td')
                 title = td_elements[1].text_content()
                 date = td_elements[3].text_content()
                 description = td_elements[5].text_content()
                 authors = td_elements[7].text_content().split('/')

                 bill = Bill(session, chamber, title, description)
                 # BUGFIX: the original never recorded a source.
                 bill.add_source(bill_search_url)

                 # A lone author is the primary sponsor; multiple
                 # authors are all recorded as co-sponsors (this
                 # matches the original behavior).
                 if len(authors) == 1:
                     bill.add_sponsor('primary', authors[0])
                 else:
                     for author in authors:
                         bill.add_sponsor('cosponsor', author)

                 td_elements = actions.cssselect('td')
                 # Skip the four header cells and the trailing cell.
                 td_elements = td_elements[4:-1]
                 action_elements = grouper(3, td_elements)

                 for date_element, action, empty in action_elements:
                     # Clean non-breaking-space characters
                     date_text = date_element.text_content().replace(u'\xa0', u'')
                     date = dt.datetime.strptime(date_text, '%m/%d/%Y')
                     action_text = action.text_content()
                     try:
                         doc_link_part = action.iterlinks().next()[2]
                         # 'voto' links are vote records, not bill text;
                         # skip them instead of abusing raise/except.
                         if 'voto' not in doc_link_part:
                             doc_link = doc_link_url(doc_link_part)
                             # BUGFIX: pass name and URL as two args,
                             # consistent with add_version() elsewhere
                             # (the original passed a single tuple).
                             bill.add_version(action_text, doc_link)
                     except StopIteration:
                         # Action row without a link: no version to add.
                         pass

                     bill.add_action(chamber, action_text, date)

                 # BUGFIX: the original never persisted the bill.
                 self.save_bill(bill)
Ejemplo n.º 17
0
    def scrape_session(self, chamber, year):
        """Scrape Alaska bills for one chamber and the two-year session
        starting in *year*.

        Fetches the BASIS bill-range listing, then for each bill pulls
        its info page (sponsors, actions, subjects) and its full-text
        page (versions), saving a Bill for each.
        """
        if chamber == 'upper':
            bill_abbr = 'SB|SCR|SJR'
        elif chamber == 'lower':
            bill_abbr = 'HB|HCR|HJR'

        # Sessions last 2 years, 1993-1994 was the 18th.  Use explicit
        # floor division so the session number stays an integer under
        # both Python 2 and Python 3 division semantics.
        session = str(18 + ((int(year) - 1993) // 2))
        year2 = str(int(year) + 1)

        # Full calendar year (MMDDYY endpoints for the range query).
        date1 = '0101' + year[2:]
        date2 = '1231' + year2[2:]

        # Get bill list
        bill_list_url = 'http://www.legis.state.ak.us/'\
            'basis/range_multi.asp?session=%s&date1=%s&date2=%s' % (
            session, date1, date2)
        self.log("Getting bill list for %s %s (this may take a long time)." %
                 (chamber, session))
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Find bill links
        re_str = "bill=%s\d+" % bill_abbr
        links = bill_list.findAll(href=re.compile(re_str))

        for link in links:
            bill_id = link.contents[0].replace(' ', '')
            bill_name = link.parent.parent.findNext('td').find(
                'font').contents[0].strip()
            bill = Bill(session, chamber, bill_id, bill_name.strip())

            # Get the bill info page
            info_url = "http://www.legis.state.ak.us/basis/%s" % link['href']
            info_page = self.soup_parser(self.urlopen(info_url))
            bill.add_source(info_url)

            # Get sponsors
            spons_str = info_page.find(
                text="SPONSOR(s):").parent.parent.contents[1]
            sponsors_match = re.match(
                ' (SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
                spons_str)
            if sponsors_match:
                sponsors = sponsors_match.group(2).split(',')
                bill.add_sponsor('primary', sponsors[0].strip())

                for sponsor in sponsors[1:]:
                    bill.add_sponsor('cosponsor', sponsor.strip())
            else:
                # No legislator names matched: committee sponsorship.
                bill.add_sponsor('committee', spons_str.strip())

            # Get actions
            act_rows = info_page.findAll('table', 'myth')[1].findAll('tr')[1:]
            for row in act_rows:
                cols = row.findAll('td')
                act_date = cols[0].font.contents[0]
                act_date = dt.datetime.strptime(act_date, '%m/%d/%y')

                # "(H)"/"(S)" markers identify the acting chamber.
                if cols[2].font.string == "(H)":
                    act_chamber = "lower"
                elif cols[2].font.string == "(S)":
                    act_chamber = "upper"
                else:
                    act_chamber = chamber

                action = cols[3].font.contents[0].strip()
                # Actions like "PASSED Y38 N0" carry a floor vote.
                if re.match("\w+ Y(\d+) N(\d+)", action):
                    try:
                        vote = self.parse_vote(bill, action,
                                               act_chamber, act_date,
                                               cols[1].a['href'])
                        bill.add_vote(vote)
                    except Exception:
                        # Narrowed from a bare `except:` so we no longer
                        # swallow KeyboardInterrupt / SystemExit.
                        self.log("Failed parsing vote at %s" %
                                 cols[1].a['href'])

                bill.add_action(act_chamber, action, act_date)

            # Get subjects
            bill['subjects'] = []
            subject_link_re = re.compile('.*subject=\w+$')
            for subject_link in info_page.findAll('a', href=subject_link_re):
                subject = subject_link.contents[0].strip()
                bill['subjects'].append(subject)

            # Get versions
            text_list_url = "http://www.legis.state.ak.us/"\
                "basis/get_fulltext.asp?session=%s&bill=%s" % (
                session, bill_id)
            text_list = self.soup_parser(self.urlopen(text_list_url))
            bill.add_source(text_list_url)

            text_link_re = re.compile('^get_bill_text?')
            for text_link in text_list.findAll('a', href=text_link_re):
                text_name = text_link.parent.previousSibling.contents[0]
                text_name = text_name.strip()

                text_url = "http://www.legis.state.ak.us/basis/%s" % (
                    text_link['href'])

                bill.add_version(text_name, text_url)

            self.save_bill(bill)
Ejemplo n.º 18
0
    def scrape_bill_type(self, chamber, session, bill_type, type_abbr):
        """Scrape all California bills of one measure type for a session.

        Reads bills, versions, actions and votes from the local mirror of
        California's legislative database (CABill / CABillVersion via the
        SQLAlchemy ``self.session``) and saves a Bill for each measure.
        """
        if chamber == 'upper':
            chamber_name = 'SENATE'
        else:
            chamber_name = 'ASSEMBLY'


        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(
            measure_type=type_abbr)


        for bill in bills:
            bill_session = session
            # A non-zero session_num marks a special session.
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id

            fsbill = Bill(bill_session, chamber, bill_id, '')

            # Title/type/subject end up holding the values from the last
            # version yielded by the query below (presumably the newest —
            # TODO confirm the query's ordering).
            title = ''
            short_title = ''
            type = ['bill']
            subject = ''
            for version in self.session.query(CABillVersion).filter_by(
                bill=bill).filter(CABillVersion.bill_xml != None):

                title = version.title
                short_title = version.short_title
                type = [bill_type]

                if version.appropriation == 'Yes':
                    type.append('appropriation')
                if version.fiscal_committee == 'Yes':
                    type.append('fiscal committee')
                if version.local_program == 'Yes':
                    type.append('local program')
                if version.urgency == 'Yes':
                    type.append('urgency')
                if version.taxlevy == 'Yes':
                    type.append('tax levy')

                subject = version.subject

                fsbill.add_version(version.bill_version_id, '',
                                   date=version.bill_version_action_date,
                                   title=version.title,
                                   short_title=version.short_title,
                                   subject=[subject],
                                   type=type)

            # No version with XML means no usable title; skip the bill.
            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill['title'] = title
            fsbill['short_title'] = short_title
            fsbill['type'] = type
            fsbill['subjects'] = [subject]

            # NOTE(review): `version` here is the loop variable left over
            # from the versions loop above, so sponsors come from the last
            # version only.
            for author in version.authors:
                if author.house == chamber_name:
                    fsbill.add_sponsor(author.contribution, author.name)

            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                actor = action.actor or chamber
                actor = actor.strip()
                # Map full-chamber actors to openstates chamber names;
                # committee actors keep their name with the chamber word
                # rewritten.
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {'Assembly': 'lower',
                             'Senate': 'upper'}[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:
                    actor = re.sub('^Assembly', 'lower', actor)
                    actor = re.sub('^Senate', 'upper', actor)

                type = []

                act_str = action.action
                if act_str.startswith('Introduced'):
                    type.append('bill:introduced')

                if 'To Com' in act_str:
                    type.append('committee:referred')

                if 'Read third time.  Passed.' in act_str:
                    type.append('bill:passed')

                if 'Approved by Governor' in act_str:
                    type.append('governor:signed')

                if 'Item veto' in act_str:
                    type.append('governor:vetoed:line-item')

                if not type:
                    type = ['other']

                fsbill.add_action(actor, act_str, action.action_date,
                                  type=type)

            for vote in bill.votes:
                if vote.vote_result == '(PASS)':
                    result = True
                else:
                    result = False

                # Location strings look like "Asm <committee>" /
                # "Sen <committee>" / "Assembly Floor" etc.
                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                    vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                    vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                motion = vote.motion.motion_text or ''

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = 'passage'
                elif "Do Pass" in motion:
                    vtype = 'passage'
                else:
                    vtype = 'other'

                motion = motion.strip()

                # Strip session qualifiers, chamber prefixes and
                # bill-number prefixes/suffixes from the motion text.
                # The order of these substitutions matters.
                # Why did it take until 2.7 to get a flags argument on re.sub?
                motion = re.compile(r'(\w+)( Extraordinary)? Session$',
                                    re.IGNORECASE).sub('', motion)
                motion = re.compile(r'^(Senate|Assembly) ',
                                    re.IGNORECASE).sub('', motion)
                motion = re.sub(r'^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ',
                                '', motion)
                motion = re.sub(r' \(\w+\)$', '', motion)
                motion = re.sub(r'(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$',
                                '', motion)
                motion = re.sub(r'(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? '
                                r'Urgency Clause$',
                                '(Urgency Clause)', motion)
                motion = re.sub(r'\s+', ' ', motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                fsvote = Vote(vote_chamber,
                              vote.vote_date_time,
                              motion,
                              result,
                              int(vote.ayes),
                              int(vote.noes),
                              int(vote.abstain),
                              threshold=vote.threshold,
                              type=vtype)

                # Non-floor votes are committee votes.
                if vote_location != 'Floor':
                    fsvote['committee'] = vote_location

                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        fsvote.yes(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        fsvote.no(record.legislator_name)
                    else:
                        fsvote.other(record.legislator_name)

                fsbill.add_vote(fsvote)

            self.save_bill(fsbill)
Ejemplo n.º 19
0
    def parse_bill_xml(self, chamber, session, txt):
        """Parse a Texas bill-history XML document into a Bill object."""
        root = lxml.etree.fromstring(txt)
        bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])
        bill_title = root.findtext("caption")

        # "81R" -> "81": drop the regular-session suffix.
        if session[2] == 'R':
            session = session[0:2]

        # Classify the measure from its number prefix (HB, SR, HCR, SJR...).
        if bill_id[1:3] == 'CR':
            bill_type = ['concurrent resolution']
        elif bill_id[1:3] == 'JR':
            bill_type = ['joint resolution']
        elif bill_id[1] == 'B':
            bill_type = ['bill']
        elif bill_id[1] == 'R':
            bill_type = ['resolution']
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

        # Exact action descriptions mapped to openstates action types.
        exact_types = {
            'Amended': 'amendment:passed',
            'Amendment(s) offered': 'amendment:introduced',
            'Amendment amended': 'amendment:amended',
            'Amendment withdrawn': 'amendment:withdrawn',
            'Passed': 'bill:passed',
            'Read first time': 'bill:introduced',
        }
        # Prefix-matched descriptions, tried when no exact match hits.
        prefix_types = (
            ('Received from the', 'bill:introduced'),
            # But what if it gets lost in the mail?
            ('Sent to the Governor', 'governor:received'),
            ('Signed by the Governor', 'governor:signed'),
        )

        for action in root.findall('actions/action'):
            action_date = dt.datetime.strptime(action.findtext('date'),
                                               "%m/%d/%Y").date()

            extra = {'action_number': action.find('actionNumber').text}
            comment = action.find('comment')
            if comment is not None and comment.text:
                extra['comment'] = comment.text.strip()

            # The action number's leading letter encodes the actor.
            actor = {'H': 'lower',
                     'S': 'upper',
                     'E': 'executive'}[extra['action_number'][0]]

            description = action.findtext('description').strip()

            atype = exact_types.get(description)
            if atype is None:
                for prefix, prefixed_type in prefix_types:
                    if description.startswith(prefix):
                        atype = prefixed_type
                        break
                else:
                    atype = 'other'

            bill.add_action(actor, action.findtext('description'),
                            action_date, type=atype, **extra)

        # Sponsor fields are pipe-delimited name lists.
        for field, role in (('authors', 'author'),
                            ('coauthors', 'coauthor'),
                            ('sponsors', 'sponsor'),
                            ('cosponsors', 'cosponsor')):
            for name in root.findtext(field).split(' | '):
                if name:
                    bill.add_sponsor(role, name)

        bill['subjects'] = [subject.text.strip()
                            for subject in root.iterfind('subjects/subject')]

        return bill
Ejemplo n.º 20
0
    def scrape_bill_pages(self, session, year_abr):
        """Scrape New Jersey bills for one session from the legislature's
        FTP site.

        Bills, sponsors, documents and actions come from DBF tables;
        roll-call votes come from zipped CSV files.  Every bill collected
        into ``bill_dict`` is saved exactly once at the end.
        """
        # Main bill information
        main_bill_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/MAINBILL.DBF' % (year_abr)
        MAINBILL_dbf, resp = self.urlretrieve(main_bill_url)
        main_bill_db = dbf.Dbf(MAINBILL_dbf)
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["synopsis"]
            # Assembly bill types start with 'A'; the rest are Senate.
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"
            bill = Bill(str(session), chamber, bill_id, title)
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        # Sponsors
        bill_sponsors_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/BILLSPON.DBF' % (year_abr)
        SPONSORS_dbf, resp = self.urlretrieve(bill_sponsors_url)
        bill_sponsors_db = dbf.Dbf(SPONSORS_dbf)

        for rec in bill_sponsors_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            name = rec["sponsor"]
            sponsor_type = rec["type"]
            if sponsor_type == 'P':
                sponsor_type = "Primary"
            else:
                sponsor_type = "Co-sponsor"
            bill.add_sponsor(sponsor_type, name)

        # Documents
        bill_document_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/BILLWP.DBF' % (year_abr)
        DOC_dbf, resp = self.urlretrieve(bill_document_url)
        bill_document_db = dbf.Dbf(DOC_dbf)

        for rec in bill_document_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            # Document paths are Windows-style; keep only the last two
            # components ("subdir/filename") for the FTP URL.
            document = rec["document"]
            document = document.split('\\')
            doc_name = document[-1]
            document = document[-2] + "/" + document[-1]
            year = str(year_abr) + str((year_abr + 1))
            doc_url = "ftp://www.njleg.state.nj.us/%s" % year
            doc_url = doc_url + "/" + document
            bill.add_document(doc_name, doc_url)

        # Votes: one zipped CSV per chamber per year of the session.
        # The second-year files do not exist for 2010.
        file1 = 'A' + str(year_abr)
        file2 = 'A' + str(year_abr + 1)
        file3 = 'S' + str(year_abr)
        file4 = 'S' + str(year_abr + 1)
        if str(year_abr) != '2010':
            vote_info_list = [file1, file2, file3, file4]
        else:
            vote_info_list = [file1, file3]
        for bill_vote_file in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % bill_vote_file
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
            zipedfile = zipfile.ZipFile(s_vote_zip)
            vfile = "%s.txt" % bill_vote_file
            vote_file = zipedfile.open(vfile, 'U')
            vdict_file = csv.DictReader(vote_file)

            votes = {}
            if bill_vote_file[0] == "A":
                chamber = "lower"
            else:
                chamber = "upper"

            for rec in vdict_file:
                bill_id = rec["Bill"]
                bill_id = bill_id.strip()
                leg = rec["Full_Name"]

                date = rec["Session_Date"]
                date = datetime.strptime(date, "%m/%d/%Y")
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
                # One Vote object per (bill, action) pair.
                vote_id = bill_id + "_" + action
                vote_id = vote_id.replace(" ", "_")
                passed = None

                if vote_id not in votes:
                    votes[vote_id] = Vote(chamber, date, action, passed, None, None, None, bill_id = bill_id)
                if leg_vote == "Y":
                    votes[vote_id].yes(leg)
                elif leg_vote == "N":
                    votes[vote_id].no(leg)
                else:
                    votes[vote_id].other(leg)

            # Counts yes/no/other votes and saves overall vote
            for vote in votes.itervalues():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote_other_count = len(vote["other_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = vote_other_count
                if vote_yes_count > vote_no_count:
                    vote["passed"] = True
                else:
                    vote["passed"] = False
                vote_bill_id = vote["bill_id"]
                bill = bill_dict[vote_bill_id]
                bill.add_vote(vote)

        # Actions
        bill_action_url = 'ftp://www.njleg.state.nj.us/ag/%sdata/BILLHIST.DBF' % (year_abr)
        ACTION_dbf, resp = self.urlretrieve(bill_action_url)
        bill_action_db = dbf.Dbf(ACTION_dbf)

        # BUGFIX: the sources below used to be attached only to whichever
        # bill happened to be bound to `bill` last; attach them to every
        # bill instead.
        for each_bill in bill_dict.itervalues():
            each_bill.add_source(bill_sponsors_url)
            each_bill.add_source(bill_document_url)
            each_bill.add_source(bill_action_url)

        for rec in bill_action_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            action = rec["action"]
            date = rec["dateaction"]
            actor = rec["house"]
            comment = rec["comment"]
            bill.add_action(actor, action, date, comment = comment)

        # BUGFIX: save each bill exactly once.  The original called
        # save_bill() inside the action loop, saving a bill once per
        # action record and never saving bills without actions.
        for each_bill in bill_dict.itervalues():
            self.save_bill(each_bill)
Ejemplo n.º 21
0
    def scrape_bill(self, chamber, session, url):
        """Scrape one Florida bill page: actions, sponsors, versions
        and both chambers' votes."""
        url = url + "&Year=%s" % session
        with self.urlopen(url) as page:
            page = page.replace('&nbsp;', ' ').replace('<br>', '\n')
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            title = page.xpath('//h3')[0].text.strip()
            title = re.match(r"^\w+\s+\d+:\s+(.*)$", title).group(1)

            bill_id = page.xpath("string(//pre[@class='billhistory']/b)")
            bill_id = bill_id.split()[0].strip()

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(url)

            hist = page.xpath("string(//pre[@class='billhistory'])").strip()
            act_re = re.compile(r'^  (\d\d/\d\d/\d\d) (SENATE|HOUSE)'
                                r'(.*\n(\s{16,16}.*\n){0,})',
                                re.MULTILINE)

            # Substring markers mapped to openstates action types.
            contains_types = (('referred to', 'committee:referred'),
                              ('died in committee', 'committee:failed'),
                              ('favorable by', 'committee:passed:favorable'),
                              ('amendment(s) adopted', 'amendment:passed'))

            # Actions
            for m in act_re.finditer(hist):
                raw_action = m.group(3).replace('\n', ' ')
                raw_action = re.sub(r'\s+', ' ', raw_action).strip()

                actor = 'upper' if m.group(2) == 'SENATE' else 'lower'
                date = datetime.datetime.strptime(m.group(1), "%m/%d/%y")

                # One history entry can contain several journal-delimited
                # actions; split on the journal references.
                for act_text in re.split(' -[HS]J \d+;? ?', raw_action):
                    act_text = act_text.strip()
                    if not act_text:
                        continue

                    lowered = act_text.lower()
                    types = []
                    if lowered.startswith('introduced'):
                        types.append('bill:introduced')
                    for marker, marker_type in contains_types:
                        if marker in lowered:
                            types.append(marker_type)

                    bill.add_action(actor, act_text, date, type=types)

            # Sponsors
            primary_sponsor = re.search(r'by ([^;(\n]+;?|\w+)',
                                        hist).group(1).strip('; ')
            bill.add_sponsor('primary', primary_sponsor)

            cospon_re = re.compile(r'\((CO-SPONSORS|CO-AUTHORS)\) '
                                   '([\w .]+(;[\w .\n]+){0,})',
                                   re.MULTILINE)
            cosponsor_match = cospon_re.search(hist)

            if cosponsor_match:
                for cosponsor in cosponsor_match.group(2).split(';'):
                    cosponsor = cosponsor.replace('\n', '').strip()
                    bill.add_sponsor('cosponsor', cosponsor)

            # Versions
            for link in page.xpath("//a[contains(@href, 'billtext/html')]"):
                version_name = link.xpath('string(../../td[1])').strip()
                bill.add_version(version_name, link.attrib['href'])

            # House votes
            for link in page.xpath("//a[contains(@href, 'votes/html/h')]"):
                bill.add_vote(self.scrape_lower_vote(link.attrib['href']))

            # Senate votes
            for link in page.xpath("//a[contains(@href, 'votes/html/S')]"):
                bill.add_vote(self.scrape_upper_vote(link.attrib['href']))

            self.save_bill(bill)
Ejemplo n.º 22
0
    def scrape_new_session(self, chamber, session):
        """Scrape South Dakota bill data for sessions from 2009 onward."""
        if chamber == 'upper':
            bill_abbr = 'SB'
        elif chamber == 'lower':
            bill_abbr = 'HB'

        # Fetch the session's bill-list page.
        session_url = 'http://legis.state.sd.us/sessions/%s/' % session
        bill_list_url = session_url + 'BillList.aspx'
        self.log('Getting bill list for %s %s' % (chamber, session))
        listing = self.soup_parser(self.urlopen(bill_list_url))

        # Bill links read like "HB\xa01234"; action rows start with a date.
        bill_re = re.compile(u'%s\xa0(\d+)' % bill_abbr)
        date_re = re.compile('\d{2}/\d{2}/\d{4}')

        for link in listing.findAll('a'):
            if not link.contents:
                # Empty anchor; nothing to match against.
                continue

            if not bill_re.search(link.contents[0]):
                # Not a bill link for this chamber.
                continue

            # Normalize the non-breaking space in the bill ID.
            bill_id = link.contents[0].replace(u'\xa0', ' ')
            bill_name = link.findNext().contents[0]

            # Download the bill's history page.
            hist_url = session_url + link['href']
            history = self.soup_parser(self.urlopen(hist_url))

            bill = Bill(session, chamber, bill_id, bill_name)
            bill.add_source(hist_url)

            # Bill versions live in the second table.
            version_table = history.findAll('table')[1]
            for row in version_table.findAll('tr')[2:]:
                cell = row.findAll('td')[1]
                version_url = "http://legis.state.sd.us/sessions/%s/%s" % (
                    session, cell.a['href'])
                bill.add_version(cell.a.contents[0].strip(), version_url)

            # Actions are in the first table, past six header rows.
            act_table = history.find('table')
            for act_row in act_table.findAll('tr')[6:]:
                if act_row.find(text='Action'):
                    # Column-header row, not an action.
                    continue

                # Rows without a leading date aren't actions.
                date_match = date_re.match(act_row.td.a.contents[0])
                if not date_match:
                    continue
                act_date = dt.datetime.strptime(date_match.group(0),
                                                "%m/%d/%Y")

                # Assemble the action text from the cell's child nodes.
                pieces = []
                for node in act_row.findAll('td')[1].contents:
                    if not hasattr(node, 'contents'):
                        # Plain text node.
                        pieces.append(node)
                        continue

                    text = node.contents[0]
                    pieces.append(text)

                    if text.startswith('YEAS'):
                        # This is a vote!
                        vote_url = "http://legis.state.sd.us/sessions/"\
                            "%s/%s" % (session, node['href'])

                        vote = self.scrape_new_vote(vote_url)
                        vote['date'] = act_date
                        bill.add_vote(vote)

                action = "".join(pieces).strip()

                # Add action
                bill.add_action(chamber, action, act_date)

            self.save_bill(bill)
Ejemplo n.º 23
0
    def scrape_bill(self, chamber, session, bill_id, title, url):
        """Scrape a single South Dakota bill page, recording its actions
        and any roll-call votes linked from the action table."""
        with self.urlopen(url) as page:
            page = lxml.html.fromstring(page)
            page.make_links_absolute(url)

            bill = Bill(session, chamber, bill_id, title)
            bill.add_source(url)

            # Actions default to the bill's own chamber until a
            # "First read in ..." action says otherwise.
            actor = chamber

            for row in page.xpath(
                "//table[contains(@id, 'BillActions')]/tr")[6:]:

                action = row.xpath("string(td[2])").strip()
                if action == 'Action':
                    # Header row.
                    continue

                atypes = []
                if action.startswith('First read'):
                    atypes.append('bill:introduced')
                elif action.startswith('Signed by Governor'):
                    atypes.append('governor:signed')

                pass_match = re.match(
                    r'(.*) Do Pass( Amended)?, (Passed|Failed)', action)
                if pass_match:
                    # A full chamber passing is a bill-level event;
                    # anything else is a committee event.
                    full_chambers = ('Senate', 'House of Representatives')
                    if pass_match.group(1) in full_chambers:
                        scope = 'bill'
                    else:
                        scope = 'committee'
                    atypes.append("%s:%s" % (scope,
                                             pass_match.group(3).lower()))

                if 'referred to' in action.lower():
                    atypes.append('committee:referred')

                if 'Motion to amend, Passed Amendment' in action:
                    atypes.extend(['amendment:introduced',
                                   'amendment:passed'])

                if 'Veto override, Passed' in action:
                    atypes.append('bill:veto_override:passed')
                elif 'Veto override, Failed' in action:
                    atypes.append('bill:veto_override:failed')

                read_match = re.match("First read in (Senate|House)", action)
                if read_match:
                    actor = ('upper' if read_match.group(1) == 'Senate'
                             else 'lower')

                date = row.xpath("string(td[1])").strip()
                if not re.match('\d{2}/\d{2}/\d{4}', date):
                    self.warning("Bad date: %s" % date)
                    continue
                date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

                # Roll-call links in the action cell are votes.
                for link in row.xpath("td[2]/a[contains(@href, 'RollCall')]"):
                    self.scrape_vote(bill, date, link.attrib['href'])

                bill.add_action(actor, action, date, type=atypes)

            self.save_bill(bill)
Ejemplo n.º 24
0
    def scrape_old_session(self, chamber, session):
        """
        Scrape SD's bill data from 1997 through 2008.

        Walks the session's billlist.htm index page; for each bill link it
        follows the bill's history page to collect versions, actions and
        (via scrape_old_vote) roll-call votes, then saves the Bill.
        """

        # The chamber determines which bill prefix to match in link text.
        if chamber == 'upper':
            bill_abbr = 'SB'
        else:
            bill_abbr = 'HB'

        # Get bill list page (and replace malformed tags that some versions of
        # BeautifulSoup choke on)
        session_url = 'http://legis.state.sd.us/sessions/%s/' % session
        bill_list_url = session_url + 'billlist.htm'
        bill_list = self.soup_parser(self.urlopen(bill_list_url))

        # Bill and text link formats
        bill_re = re.compile('%s (\d+)' % bill_abbr)
        text_re = re.compile('/sessions/%s/bills/%s.*\.htm' % (
                session, bill_abbr), re.IGNORECASE)
        date_re = re.compile('\d{2}/\d{2}/\d{4}')

        for bill_link in bill_list.findAll('a', href=re.compile('\d\.htm$')):
            if len(bill_link.contents) == 0:
                # Empty link
                continue

            bill_match = bill_re.match(bill_link.contents[0])
            if not bill_match:
                # Not bill link
                continue

            # Get the bill ID and name
            bill_id = bill_link.contents[0]
            bill_name = bill_link.findNext().contents[0]

            # Get history page (replacing malformed tag)
            hist_url = session_url + bill_link['href']
            history = self.soup_parser(self.urlopen(hist_url))

            # Get URL of latest version of bill (should be listed last)
            # NOTE(review): bill_url is computed but never attached to the
            # bill below -- confirm whether a version/source add is missing.
            bill_url = history.findAll('a', href=text_re)[-1]['href']
            bill_url = 'http://legis.state.sd.us%s' % bill_url

            # Add bill
            bill = Bill(session, chamber, bill_id, bill_name)
            bill.add_source(hist_url)

            # Get bill versions: second table on the history page; the
            # first two rows are headers.
            text_table = history.findAll('table')[1]
            for row in text_table.findAll('tr')[2:]:
                #version_date = row.find('td').string
                version_path = row.findAll('td')[1].a['href']
                version_url = "http://legis.state.sd.us" + version_path

                version_name = row.findAll('td')[1].a.contents[0].strip()

                bill.add_version(version_name, version_url)

            # Get actions: first table; rows before index 6 are headers.
            act_table = history.find('table')
            for act_row in act_table.findAll('tr')[6:]:
                if act_row.find(text="Action"):
                    continue

                # Get the date (if can't find one then this isn't an action)
                date_match = date_re.match(act_row.td.a.contents[0])
                if not date_match:
                    continue
                act_date = date_match.group(0)
                act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")

                # Get the action string by flattening the cell's mixed
                # text/element children; "YEAS ..." links double as
                # pointers to roll-call vote pages.
                action = ""
                for node in act_row.findAll('td')[1].contents:
                    if hasattr(node, 'contents'):
                        action += node.contents[0]

                        if node.contents[0].startswith('YEAS'):
                            # This is a vote!
                            if node['href'][0] == '/':
                                vote_url = "http://legis.state.sd.us/%s" % (
                                    node['href'])
                            else:
                                vote_url = "http://legis.state.sd.us/"\
                                    "sessions/%s/%s" % (session, node['href'])

                            vote = self.scrape_old_vote(vote_url)
                            vote['date'] = act_date
                            bill.add_vote(vote)
                    else:
                        action += node
                action = action.strip()

                # Add action
                bill.add_action(chamber, action, act_date)

            self.save_bill(bill)
Ejemplo n.º 25
0
    def scrape_bill_info(self, session, ld, session_id, bill_id, title):
        """Scrape one Maine bill from LawMakerWeb: chamber, action
        history and roll-call votes, then save the Bill.

        ld / session_id identify the bill on LawMakerWeb; bill_id and
        title are supplied by the caller's bill listing.
        """
        bill_info_url = 'http://www.mainelegislature.org/LawMakerWeb/summary.asp?LD=%s&SessionID=%s' % (ld, session_id)
        with self.urlopen(bill_info_url) as bill_sum_page:
            root = lxml.etree.fromstring(bill_sum_page, lxml.etree.HTMLParser())
            # Senate bills are prefixed with "S"; everything else is House.
            if bill_id[0] == "S":
                chamber = "upper"
            else:
                chamber = "lower"
            bill = Bill(str(session), chamber, bill_id, title)
            bill.add_source(bill_info_url)

            # Actions: the summary page's third table row links to the
            # bill's status/action history page.
            actions_url_addon = root.xpath('string(//table/tr[3]/td/a/@href)')
            actions_url = 'http://www.mainelegislature.org/LawMakerWeb/%s' % actions_url_addon
            bill.add_source(actions_url)
            with self.urlopen(actions_url) as actions_page:
                root2 = lxml.etree.fromstring(actions_page, lxml.etree.HTMLParser())
                # `count` tracks the table row holding the current date
                # cell so actor/action come from the matching row.
                count = 2
                for mr in root2.xpath("//td[2]/table[2]/tr[position() > 1]/td[1]"):
                    date = mr.xpath('string()')
                    date = datetime.strptime(date, "%m/%d/%Y")
                    actor_path = "string(//td[2]/table/tr[%s]/td[2])" % count
                    actor = root2.xpath(actor_path)
                    action_path = "string(//td[2]/table/tr[%s]/td[3])" % count
                    action = root2.xpath(action_path)
                    count = count + 1
                    if actor == "House":
                        actor = "lower"
                    else:
                        actor = "upper"
                    bill.add_action(actor, action, date)
            # Votes: row 9 of the summary table links to the roll-call list.
            votes_url_addon = root.xpath('string(//table/tr[9]/td/a/@href)')
            votes_url = 'http://www.mainelegislature.org/LawMakerWeb/%s' % votes_url_addon
            bill.add_source(votes_url)
            with self.urlopen(votes_url) as votes_page:
                vote_root = lxml.etree.fromstring(votes_page, lxml.etree.HTMLParser())
                for mr in vote_root.xpath('//table[position() > 1]/tr/td/a'):
                    vote_detail_addon = mr.xpath('string(@href)')
                    vote_detail_url = 'http://www.mainelegislature.org/LawMakerWeb/%s' % vote_detail_addon
                    bill.add_source(vote_detail_url)
                    with self.urlopen(vote_detail_url) as vote_detail_page:
                        detail_root = lxml.etree.fromstring(vote_detail_page, lxml.etree.HTMLParser())
                        date = detail_root.xpath('string(//table[2]//tr[2]/td[3])')
                        # Dates appear with either a full month name or an
                        # abbreviated one; strptime raises ValueError on a
                        # format mismatch, so fall back to the second form.
                        try:
                            date = datetime.strptime(date, "%B %d, %Y")
                        except ValueError:
                            date = datetime.strptime(date, "%b. %d, %Y")
                        motion = detail_root.xpath('string(//table[2]//tr[3]/td[3])')
                        passed = detail_root.xpath('string(//table[2]//tr[5]/td[3])') == 'PREVAILS'
                        yes_count = detail_root.xpath('string(//table[2]//tr[6]/td[3])')
                        no_count = detail_root.xpath('string(//table[2]//tr[7]/td[3])')
                        # BUGFIX: absent/excused previously reused row 6
                        # (the yes count). Rows 8 and 9 continue the tally
                        # table -- TODO(review): confirm against a live page.
                        absent_count = detail_root.xpath('string(//table[2]//tr[8]/td[3])')
                        excused_count = detail_root.xpath('string(//table[2]//tr[9]/td[3])')
                        other_count = 0

                        # The roll-call list URL embeds the chamber name.
                        if votes_url.find('House') != -1:
                            chamber = "lower"
                        else:
                            chamber = "upper"

                        vote = Vote(chamber, date, motion, passed, int(yes_count), int(no_count), other_count, absent_count = int(absent_count), excused_count = int(excused_count))

                        # Per-legislator tallies: name, party, Y/N/other.
                        for member in detail_root.xpath('//table[3]/tr[position() > 1]'):
                            leg = member.xpath('string(td[2])')
                            party = member.xpath('string(td[3])')
                            leg_vote = member.xpath('string(td[4])')

                            if leg_vote == "Y":
                                vote.yes(leg)
                            elif leg_vote == "N":
                                vote.no(leg)
                            else:
                                vote.other(leg)
                        bill.add_vote(vote)
            self.save_bill(bill)
Ejemplo n.º 26
0
    def scrape_bill_type(self, chamber, session, bill_type, type_abbr):
        """Import every CA bill of one measure type (e.g. AB or SB) for
        a session from the mirrored legislative database.

        Reads the SQLAlchemy-mapped CABill/CABillVersion tables and
        saves each bill with its versions, sponsors, actions and votes.
        """
        # Sponsor "house" values in the DB are spelled-out chamber names.
        if chamber == "upper":
            chamber_name = "SENATE"
        else:
            chamber_name = "ASSEMBLY"

        bills = self.session.query(CABill).filter_by(session_year=session).filter_by(measure_type=type_abbr)

        for bill in bills:
            bill_session = session
            # Non-zero session_num marks a special session.
            if bill.session_num != "0":
                bill_session += " Special Session %s" % bill.session_num

            bill_id = bill.short_bill_id

            # Title is filled in below from the bill's version records.
            fsbill = Bill(bill_session, chamber, bill_id, "")

            # Construct session for web query, going from '20092010' to '0910'
            source_session = session[2:4] + session[6:8]

            # Turn 'AB 10' into 'ab_10'
            source_num = "%s_%s" % (bill.measure_type.lower(), bill.measure_num)

            # Construct a fake source url
            source_url = "http://www.leginfo.ca.gov/cgi-bin/postquery?" "bill_number=%s&sess=%s" % (
                source_num,
                source_session,
            )

            fsbill.add_source(source_url)

            title = ""
            short_title = ""
            # NOTE: `type` shadows the builtin throughout this method.
            type = ["bill"]
            subject = ""
            # Only versions with XML are considered.  Each iteration
            # overwrites title/short_title/type/subject, so the values
            # kept after the loop come from the LAST version in query
            # order.
            for version in (
                self.session.query(CABillVersion).filter_by(bill=bill).filter(CABillVersion.bill_xml != None)
            ):

                title = version.title
                short_title = version.short_title
                type = [bill_type]

                if version.appropriation == "Yes":
                    type.append("appropriation")
                if version.fiscal_committee == "Yes":
                    type.append("fiscal committee")
                if version.local_program == "Yes":
                    type.append("local program")
                if version.urgency == "Yes":
                    type.append("urgency")
                if version.taxlevy == "Yes":
                    type.append("tax levy")

                subject = version.subject

                fsbill.add_version(
                    version.bill_version_id,
                    "",
                    date=version.bill_version_action_date.date(),
                    title=version.title,
                    short_title=version.short_title,
                    subject=[subject],
                    type=type,
                )

            # No titled version at all: not enough data to import.
            if not title:
                self.warning("Couldn't find title for %s, skipping" % bill_id)
                continue

            fsbill["title"] = title
            fsbill["short_title"] = short_title
            fsbill["type"] = type
            fsbill["subjects"] = [subject]

            # NOTE(review): `version` here is the last item from the
            # versions loop above; the `continue` on empty titles
            # guarantees at least one version existed before this point.
            for author in version.authors:
                if author.house == chamber_name:
                    fsbill.add_sponsor(author.contribution, author.name)

            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                # Normalize the acting body to upper/lower/executive;
                # committee actors keep their name with the chamber word
                # replaced.
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r"(Assembly|Senate)($| \(Floor)", actor)
                if match:
                    actor = {"Assembly": "lower", "Senate": "upper"}[match.group(1)]
                elif actor.startswith("Governor"):
                    actor = "executive"
                else:
                    actor = re.sub("^Assembly", "lower", actor)
                    actor = re.sub("^Senate", "upper", actor)

                type = []

                # Map recognizable action phrases to normalized types.
                act_str = action.action
                if act_str.startswith("Introduced"):
                    type.append("bill:introduced")

                if "To Com" in act_str:
                    type.append("committee:referred")

                if "Read third time.  Passed." in act_str:
                    type.append("bill:passed")

                if "Approved by Governor" in act_str:
                    type.append("governor:signed")

                if "Item veto" in act_str:
                    type.append("governor:vetoed:line-item")

                if not type:
                    type = ["other"]

                fsbill.add_action(actor, act_str, action.action_date.date(), type=type)

            for vote in bill.votes:
                if vote.vote_result == "(PASS)":
                    result = True
                else:
                    result = False

                # The location's first word identifies the voting chamber;
                # the remainder (if any) names the committee.
                full_loc = vote.location.description
                first_part = full_loc.split(" ")[0].lower()
                if first_part in ["asm", "assembly"]:
                    vote_chamber = "lower"
                    vote_location = " ".join(full_loc.split(" ")[1:])
                elif first_part.startswith("sen"):
                    vote_chamber = "upper"
                    vote_location = " ".join(full_loc.split(" ")[1:])
                else:
                    raise ScrapeError("Bad location: %s" % full_loc)

                motion = vote.motion.motion_text or ""

                if "Third Reading" in motion or "3rd Reading" in motion:
                    vtype = "passage"
                elif "Do Pass" in motion:
                    vtype = "passage"
                else:
                    vtype = "other"

                motion = motion.strip()

                # Why did it take until 2.7 to get a flags argument on re.sub?
                # Strip session/chamber/bill-number boilerplate from the
                # motion text, then collapse whitespace.
                motion = re.compile(r"(\w+)( Extraordinary)? Session$", re.IGNORECASE).sub("", motion)
                motion = re.compile(r"^(Senate|Assembly) ", re.IGNORECASE).sub("", motion)
                motion = re.sub(r"^(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.?  ", "", motion)
                motion = re.sub(r" \(\w+\)$", "", motion)
                motion = re.sub(r"(SCR|SB|AB|AJR|ACR)\s?\d+ \w+\.?$", "", motion)
                motion = re.sub(r"(SCR|SJR|SB|AB|AJR|ACR)\s?\d+ \w+\.? " r"Urgency Clause$", "(Urgency Clause)", motion)
                motion = re.sub(r"\s+", " ", motion)

                if not motion:
                    self.warning("Got blank motion on vote for %s" % bill_id)
                    continue

                fsvote = Vote(
                    vote_chamber,
                    self._tz.localize(vote.vote_date_time),
                    motion,
                    result,
                    int(vote.ayes),
                    int(vote.noes),
                    int(vote.abstain),
                    threshold=vote.threshold,
                    type=vtype,
                )

                # Non-floor votes record the committee name.
                if vote_location != "Floor":
                    fsvote["committee"] = vote_location

                for record in vote.votes:
                    if record.vote_code == "AYE":
                        fsvote.yes(record.legislator_name)
                    elif record.vote_code.startswith("NO"):
                        fsvote.no(record.legislator_name)
                    else:
                        fsvote.other(record.legislator_name)

                fsbill.add_vote(fsvote)

            self.save_bill(fsbill)
Ejemplo n.º 27
0
    def scrape(self, chamber, year):
        """Scrape CA bills for one chamber and session year from the
        mirrored legislative database.

        The session key is the two-year span (e.g. '20092010'); raises
        NoDataForPeriod if it is not in the configured metadata.
        """
        session = "%s%d" % (year, int(year) + 1)
        if session not in [s_ for t in metadata['terms']
                           for s_ in t['sessions']]:
            raise NoDataForPeriod(year)

        # Chamber-specific measure prefix and DB "house" spelling.
        if chamber == 'upper':
            measure_abbr = 'SB'
            chamber_name = 'SENATE'
            house_type = 'S'
        else:
            measure_abbr = 'AB'
            chamber_name = 'ASSEMBLY'
            house_type = 'A'

        bills = self.session.query(CABill).filter_by(
            session_year=session).filter_by(
            measure_type=measure_abbr)

        for bill in bills:
            bill_session = session
            # Non-zero session_num marks a special session.
            if bill.session_num != '0':
                bill_session += ' Special Session %s' % bill.session_num

            bill_id = bill.short_bill_id
            # Use the first version that has XML for the title data.
            version = self.session.query(CABillVersion).filter_by(
                bill=bill).filter(CABillVersion.bill_xml != None).first()
            if not version:
                # not enough data to import
                continue

            fsbill = Bill(bill_session, chamber, bill_id,
                          version.title,
                          short_title=version.short_title)

            for author in version.authors:
                if author.house == chamber_name:
                    fsbill.add_sponsor(author.contribution, author.name)

            for action in bill.actions:
                if not action.action:
                    # NULL action text seems to be an error on CA's part,
                    # unless it has some meaning I'm missing
                    continue
                # Normalize the acting body to upper/lower/executive;
                # committee actors keep their name with the chamber word
                # replaced.
                actor = action.actor or chamber
                actor = actor.strip()
                match = re.match(r'(Assembly|Senate)($| \(Floor)', actor)
                if match:
                    actor = {'Assembly': 'lower',
                             'Senate': 'upper'}[match.group(1)]
                elif actor.startswith('Governor'):
                    actor = 'executive'
                else:
                    actor = re.sub('^Assembly', 'lower', actor)
                    actor = re.sub('^Senate', 'upper', actor)

                # NOTE: `type` shadows the builtin here.
                type = []

                # Map recognizable action phrases to normalized types.
                act_str = action.action
                if act_str.startswith('Introduced'):
                    type.append('bill:introduced')

                if 'To Com' in act_str:
                    type.append('committee:referred')

                if 'Read third time.  Passed.' in act_str:
                    type.append('bill:passed')

                if 'Approved by Governor' in act_str:
                    type.append('bill:signed')

                if 'Item veto' in act_str:
                    type.append('veto:line-item')

                if not type:
                    type = ['other']

                fsbill.add_action(actor, act_str, action.action_date,
                                  type=type)

            for vote in bill.votes:
                if vote.vote_result == '(PASS)':
                    result = True
                else:
                    result = False

                # The location's first word identifies the voting chamber;
                # the remainder names the committee.  Unrecognized
                # locations fall through with an empty chamber rather
                # than raising.
                full_loc = vote.location.description
                first_part = full_loc.split(' ')[0].lower()
                if first_part in ['asm', 'assembly']:
                    vote_chamber = 'lower'
                    vote_location = ' '.join(full_loc.split(' ')[1:])
                elif first_part.startswith('sen'):
                    vote_chamber = 'upper'
                    vote_location = ' '.join(full_loc.split(' ')[1:])
                else:
                    vote_chamber = ''
                    vote_location = full_loc

                fsvote = Vote(vote_chamber,
                              vote.vote_date_time,
                              vote.motion.motion_text or '',
                              result,
                              vote.ayes, vote.noes, vote.abstain,
                              threshold=vote.threshold,
                              location=vote_location)

                for record in vote.votes:
                    if record.vote_code == 'AYE':
                        fsvote.yes(record.legislator_name)
                    elif record.vote_code.startswith('NO'):
                        fsvote.no(record.legislator_name)
                    else:
                        fsvote.other(record.legislator_name)

                fsbill.add_vote(fsvote)

            self.save_bill(fsbill)
Ejemplo n.º 28
0
    def scrape_bill_pages(self, session, year_abr):
        """ assemble information on a bill from a number of DBF files

        Downloads NJ's MAINBILL / BILLSPON / BILLWP / BILLHIST / BILLSUBJ
        DBF exports plus per-chamber vote zip files, builds a Bill per
        record keyed by bill id, and saves everything at the end.
        """

        #Main Bill information
        main_bill_url, main_bill_db = self.get_dbf(year_abr, 'MAINBILL')

        # keep a dictionary of bills (mapping bill_id to Bill obj)
        bill_dict = {}

        for rec in main_bill_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            title = rec["synopsis"]
            # Assembly bill types start with 'A'; Senate with 'S'.
            if bill_type[0] == 'A':
                chamber = "lower"
            else:
                chamber = "upper"

            bill = Bill(str(session), chamber, bill_id, title,
                        type=self._bill_types[bill_type[1:]])
            bill.add_source(main_bill_url)
            bill_dict[bill_id] = bill

        #Sponsors
        bill_sponsors_url, bill_sponsors_db = self.get_dbf(year_abr, 'BILLSPON')

        for rec in bill_sponsors_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            name = rec["sponsor"]
            sponsor_type = rec["type"]
            # 'P' marks the primary sponsor; anything else is a co-sponsor.
            if sponsor_type == 'P':
                sponsor_type = "Primary"
            else:
                sponsor_type = "Co-sponsor"
            bill.add_sponsor(sponsor_type, name)


        #Documents
        bill_document_url, bill_document_db = self.get_dbf(year_abr, 'BILLWP')

        #print bill_document_db[2]
        for rec in bill_document_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            # The DBF stores a Windows path; keep only the last two
            # components to build the web URL.
            document = rec["document"]
            document = document.split('\\')
            document = document[-2] + "/" + document[-1]
            year = str(year_abr) + str((year_abr + 1))

            #doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
            htm_url = 'http://www.njleg.state.nj.us/%s/Bills/%s' % (year_abr,
                document.replace('.DOC', '.HTM'))

            # name document based _doctype
            doc_name = self._doctypes[rec['doctype']]
            if rec['comment']:
                doc_name += ' ' + rec['comment']

            # Version-type documents become bill versions; everything
            # else is attached as a plain document.
            if rec['doctype'] in self._version_types:
                bill.add_version(doc_name, htm_url)
            else:
                bill.add_document(doc_name, htm_url)

        #Senate Votes
        # Vote files are one zip per chamber per year of the two-year
        # session; 2010's second-year files don't exist yet.
        file1 = 'A' + str(year_abr)
        file2 = 'A' + str(year_abr + 1)
        file3 = 'S' + str(year_abr)
        file4 = 'S' + str(year_abr + 1)
        if str(year_abr) != '2010':
            vote_info_list = [file1, file2, file3, file4]
        else:
            vote_info_list = [file1, file3]
        for bill_vote_file in vote_info_list:
            s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % bill_vote_file
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
            zipedfile = zipfile.ZipFile(s_vote_zip)
            vfile = "%s.txt" % bill_vote_file
            vote_file = zipedfile.open(vfile, 'U')
            vdict_file = csv.DictReader(vote_file)

            # Accumulate one Vote per (bill, action) across the CSV rows.
            votes = {}
            if bill_vote_file[0] == "A":
                chamber = "lower"
            else:
                chamber = "upper"

            for rec in vdict_file:
                bill_id = rec["Bill"]
                bill_id = bill_id.strip()
                leg = rec["Full_Name"]

                date = rec["Session_Date"]
                date = datetime.strptime(date, "%m/%d/%Y")
                action = rec["Action"]
                leg_vote = rec["Legislator_Vote"]
                vote_id = bill_id + "_" + action
                vote_id = vote_id.replace(" ", "_")
                # passed is unknown until the counts are tallied below.
                passed = None

                if vote_id not in votes:
                    votes[vote_id] = Vote(chamber, date, action, passed, None,
                                          None, None, bill_id=bill_id)
                if leg_vote == "Y":
                    votes[vote_id].yes(leg)
                elif leg_vote == "N":
                    votes[vote_id].no(leg)
                else:
                    votes[vote_id].other(leg)

            #Counts yes/no/other votes and saves overall vote
            for vote in votes.itervalues():
                vote_yes_count = len(vote["yes_votes"])
                vote_no_count = len(vote["no_votes"])
                vote_other_count = len(vote["other_votes"])
                vote["yes_count"] = vote_yes_count
                vote["no_count"] = vote_no_count
                vote["other_count"] = vote_other_count
                if vote_yes_count > vote_no_count:
                    vote["passed"] = True
                else:
                    vote["passed"] = False
                vote_bill_id = vote["bill_id"]
                bill = bill_dict[vote_bill_id]
                bill.add_vote(vote)

        #Actions
        bill_action_url, bill_action_db = self.get_dbf(year_abr, 'BILLHIST')


        for rec in bill_action_db:
            bill_type = rec["billtype"]
            bill_number = int(rec["billnumber"])
            bill_id = bill_type + str(bill_number)
            bill = bill_dict[bill_id]
            action = rec["action"]
            date = rec["dateaction"]
            actor = rec["house"]
            comment = rec["comment"]
            action, atype = self.categorize_action(action)
            if comment:
                action += (' ' + comment)
            bill.add_action(actor, action, date, type=atype)

        # Subjects
        subject_url, subject_db = self.get_dbf(year_abr, 'BILLSUBJ')
        for rec in subject_db:
            bill_id = rec['billtype'] + str(int(rec['billnumber']))
            bill = bill_dict.get(bill_id)
            if bill:
                bill.setdefault('subjects', []).append(rec['subjectkey'])
            else:
                self.warning('invalid bill id in BILLSUBJ.DBF: %s' % bill_id)

        # save all bills at the end
        for bill in bill_dict.itervalues():
            # add sources
            bill.add_source(bill_sponsors_url)
            bill.add_source(bill_document_url)
            bill.add_source(bill_action_url)
            bill.add_source(subject_url)
            self.save_bill(bill)
Ejemplo n.º 29
0
    def parse_bill_xml(self, chamber, session, txt):
        """Parse a TX bill-history XML document into a Bill.

        Returns the populated Bill (with actions, sponsors and subjects
        attached); the caller is responsible for saving it.
        """
        root = lxml.etree.fromstring(txt)
        bill_id = " ".join(root.attrib["bill"].split(" ")[1:])
        bill_title = root.findtext("caption")

        # Regular sessions (e.g. "81R") are identified by the bare number.
        if session[2] == "R":
            session = session[0:2]

        # Classify the measure by the letter(s) after the chamber prefix.
        one = bill_id[1]
        two = bill_id[1:3]
        if one == "B":
            bill_type = ["bill"]
        elif one == "R":
            bill_type = ["resolution"]
        elif two == "CR":
            bill_type = ["concurrent resolution"]
        elif two == "JR":
            bill_type = ["joint resolution"]
        else:
            raise ScrapeError("Invalid bill_id: %s" % bill_id)

        bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)

        # Exact-match and prefix-match tables mapping action descriptions
        # to normalized action types; anything unmatched becomes "other".
        exact_types = {
            "Amended": "amendment:passed",
            "Amendment(s) offered": "amendment:introduced",
            "Amendment amended": "amendment:amended",
            "Amendment withdrawn": "amendment:withdrawn",
            "Passed": "bill:passed",
            "Filed": "bill:introduced",
        }
        prefix_types = (
            ("Received by the Secretary of", "bill:introduced"),
            ("Received from the", "bill:introduced"),
            ("Signed by the Governor", "governor:signed"),
        )

        for node in root.findall("actions/action"):
            act_date = dt.datetime.strptime(node.findtext("date"), "%m/%d/%Y").date()

            # actionNumber's leading letter encodes the acting body.
            extra = {"action_number": node.find("actionNumber").text}
            comment = node.find("comment")
            if comment is not None and comment.text:
                extra["comment"] = comment.text.strip()

            actor = {"H": "lower", "S": "upper", "E": "executive"}[extra["action_number"][0]]

            desc = node.findtext("description").strip()
            atype = exact_types.get(desc)
            if atype is None:
                for prefix, mapped in prefix_types:
                    if desc.startswith(prefix):
                        atype = mapped
                        break
                else:
                    atype = "other"

            bill.add_action(actor, node.findtext("description"), act_date, type=atype, **extra)

        # Sponsor roles map one-to-one onto pipe-delimited elements
        # named "<role>s" in the document.
        for role in ("author", "coauthor", "sponsor", "cosponsor"):
            for name in root.findtext(role + "s").split(" | "):
                if name != "":
                    bill.add_sponsor(role, name)

        bill["subjects"] = [s.text.strip() for s in root.iterfind("subjects/subject")]

        return bill
Ejemplo n.º 30
0
    def get_bill_info(self, session, sub, bill_id):
        """Scrape one NC bill's detail page: title, versions, sponsors,
        actions and linked roll-call votes, then save the Bill.

        ``sub`` is the special-session suffix appended to the session key.
        """
        bill_detail_url = (
            "http://www.ncga.state.nc.us/gascripts/"
            "BillLookUp/BillLookUp.pl?bPrintable=true"
            "&Session=%s&BillID=%s&votesToView=all" % (session[0:4] + sub, bill_id)
        )

        # parse the bill data page, finding the latest html text
        # House bills are prefixed with "H"; Senate with "S".
        if bill_id[0] == "H":
            chamber = "lower"
        else:
            chamber = "upper"

        bill_data = self.urlopen(bill_detail_url)
        bill_soup = self.soup_parser(bill_data)

        # The title div is identified only by its inline style string.
        bill_title = bill_soup.findAll(
            "div", style="text-align: center; font: bold" " 20px Arial; margin-top: 15px;" " margin-bottom: 8px;"
        )[0].contents[0]

        bill = Bill(session + sub, chamber, bill_id, bill_title)
        bill.add_source(bill_detail_url)

        # get all versions
        links = bill_soup.findAll("a", href=re.compile("/Sessions/%s/Bills/\w+/HTML" % session[0:4]))

        for link in links:
            # The version label sits two siblings before the link; strip
            # the non-breaking spaces it is padded with.
            version_name = link.parent.previousSibling.previousSibling
            version_name = version_name.contents[0].replace("&nbsp;", " ")
            version_name = version_name.replace(u"\u00a0", " ")

            version_url = "http://www.ncga.state.nc.us" + link["href"]
            bill.add_version(version_name, version_url)

        # figure out which table has sponsor data
        sponsor_table = bill_soup.findAll("th", text="Sponsors", limit=1)[0].findParents("table", limit=1)[0]

        # Row 1 holds primary sponsors, row 2 co-sponsors.
        sponsor_rows = sponsor_table.findAll("tr")
        for leg in sponsor_rows[1].td.findAll("a"):
            bill.add_sponsor("primary", leg.contents[0].replace(u"\u00a0", " "))
        for leg in sponsor_rows[2].td.findAll("a"):
            bill.add_sponsor("cosponsor", leg.contents[0].replace(u"\u00a0", " "))

        action_table = bill_soup.findAll("th", text="Chamber", limit=1)[0].findParents("table", limit=1)[0]

        for row in action_table.findAll("tr"):
            cells = row.findAll("td")
            # Action rows have exactly date / actor / action cells.
            if len(cells) != 3:
                continue

            act_date, actor, action = map(lambda x: self.flatten(x), cells)
            act_date = dt.datetime.strptime(act_date, "%m/%d/%Y")

            # NOTE(review): "Governor" is not normalized to 'executive'
            # like the chamber names are -- confirm downstream handling.
            if actor == "Senate":
                actor = "upper"
            elif actor == "House":
                actor = "lower"
            elif action.endswith("Gov."):
                actor = "Governor"

            bill.add_action(actor, action, act_date)

        for vote in bill_soup.findAll("a", href=re.compile("RollCallVoteTranscript")):
            self.get_vote(bill, vote["href"])

        self.save_bill(bill)