Example #1
0
 def scrape_actions(self, chamber, session, bill):
     """
     Scrape the sponsors, alternate titles and actions for a given bill,
     then sort the bill's actions and save it.

     chamber -- chamber string ('upper' or 'lower'); compared against the
                actor of each first reading to decide whether that reading
                also counts as the bill's introduction, and handed to
                utils.get_actor.
     session -- session identifier; converted into the URL path segment
                by utils.legislature_to_number.
     bill    -- bill object to populate.  It must expose the 'bill_id' and
                'title' keys, support item assignment, and provide
                add_source/add_sponsor/add_title/add_action.

     The overview page is a table of nested tables.  Each nested table is
     identified by the text of its first row and handled by the matching
     branch below; tables with unrecognised headers are ignored.
     """
     ses_num = utils.legislature_to_number(session)
     bill_id = bill['bill_id'].replace(' ', '')
     action_url = BASE_URL + 'FormatDocument.asp?inDoc=/legtext/%s/bills/%so.asp' % (ses_num, bill_id.lower())
     with self.urlopen(action_url) as action_page:
         bill.add_source(action_url)
         root = html.fromstring(action_page)
         base_table = root.xpath('//table[@class="ContentAreaBackground"]')[0]
         # common xpaths
         table_path = '//table[contains(tr/td/b/text(), "%s")]'

         # sponsors: the sponsor type is read from the cell that follows
         # the grandparent of each <sponsor> element
         for sponsor in base_table.xpath('//sponsor'):
             name = sponsor.text.strip()
             s_type = sponsor.getparent().getparent().getnext().text_content().strip()
             bill.add_sponsor(s_type, name)

         # titles: only record titles that differ from the primary one
         table = base_table.xpath(table_path % 'TITLE')
         if table:
             for row in table[0].iterchildren('tr'):
                 title = row[1].text_content().strip()
                 if title != bill['title']:
                     bill.add_title(title)

         for table in base_table.xpath('tr/td/table'):
             # the header is normally the first cell of the first row; fall
             # back to the whole first row when that cell is empty
             action = table.xpath('string(tr[1]/td[1])').strip()
             if action == '':
                 action = table.xpath('string(tr[1])').strip()

             if (action.endswith(('FIRST READ:', 'SECOND READ:'))
                     or 'WAIVED' in action):
                 rows = table.xpath('tr')
                 for row in rows:
                     # drop the trailing ':' from the row label
                     action = row[0].text_content().strip()[:-1]
                     actor = 'lower' if action.startswith('H') else 'upper'
                     date = utils.get_date(row[1])
                     if action.endswith(('FIRST READ', 'FIRST WAIVED')):
                         # a first reading in the chamber passed in as
                         # `chamber` doubles as the introduction
                         if actor == chamber:
                             a_type = ['bill:introduced', 'bill:reading:1']
                         else:
                             a_type = 'bill:reading:1'
                     else:
                         a_type = 'bill:reading:2'
                     bill.add_action(actor, action, date, type=a_type)
                 continue

             elif action == 'COMMITTEES:':
                 # committee assignments; the first row is skipped
                 rows = table.xpath('tr')[1:]
                 for row in rows:
                     # First add the committee assigned action
                     meta_tag = row.cssselect('meta')[0]
                     # @name is HCOMMITTEE or SCOMMITTEE
                     h_or_s = meta_tag.get('name')[0]
                     # @content is the committee abbreviation
                     committee = meta_tag.get('content')
                     # actor is the chamber referring the bill to committee
                     actor = 'lower' if h_or_s.lower() == 'h' else 'upper'
                     act = 'assigned to committee: ' + committee
                     date = utils.get_date(row[1])
                     bill.add_action(actor, act, date, type='committee:referred')
                     # now let's see if there is a vote
                     vote_url = row[0].xpath('string(a/@href)')
                     if vote_url:
                         date = utils.get_date(row[3])
                         act = row[5].text_content().strip()
                         a_type = get_action_type(act, 'COMMITTEES:')
                         bill.add_action(actor, committee + ":" + act, date,
                                         type=a_type)
                         self.scrape_votes(actor, vote_url, bill, date,
                                           motion='committee: ' + act,
                                           committee=committee,
                                           type='other')
                     elif len(row) == 5:
                         # probably senate rules committee
                         date = utils.get_date(row[3])
                         if date == '':
                             date = utils.get_date(row[1])
                         act = row[4].text_content().strip()
                         a_type = get_action_type(act, 'COMMITTEES:')
                         bill.add_action(actor, committee + ":" + act, date,
                                         type=a_type)
                 continue

             elif 'CAUCUS' in action:
                 rows = table.xpath('tr')[0:2]
                 for row in rows:
                     actor = utils.get_actor(row, chamber)
                     action = row[0].text_content().strip()
                     if action.endswith(':'):
                         action = action[:-1]
                     # majority caucus Y|N
                     result = row[2].text_content().strip()
                     action = action + " concur: " + result
                     date = utils.get_date(row[1])
                     bill.add_action(actor, action, date, concur=result,
                                     type='other')
                 continue

             # transmit to house or senate
             elif 'TRANSMIT TO' in action:
                 rows = table.xpath('tr')
                 for row in rows:
                     action = row[0].text_content().strip()[:-1]
                     # the sending chamber is the opposite of the one named
                     actor = 'upper' if action.endswith('HOUSE') else 'lower'
                     date = utils.get_date(row[1])
                     bill.add_action(actor, action, date, type='other')
                 continue

             # Committee of the whole actions
             elif 'COW ACTION' in action:
                 rows = table.xpath('tr')
                 actor = utils.get_actor(rows[0], chamber)
                 if 'SIT COW ACTION' in action:
                     act = rows[0][-1].text_content().strip()
                     date = utils.get_date(rows[0][1])
                 else:
                     act = rows[1][2].text_content().strip()
                     date = utils.get_date(rows[1][1])
                 action = action + " " + act  # COW ACTION 1 DPA
                 bill.add_action(actor, action, date, type='other')
                 # The SIT COW ACTION path only reads rows[0], so a second
                 # row is not guaranteed to exist; check before indexing.
                 if (len(rows) > 1 and
                         rows[1][0].text_content().strip() == 'Vote Detail'):
                     vote_url = rows[1][0].xpath('string(a/@href)')
                     self.scrape_votes(actor, vote_url, bill, date,
                                       motion=action, type='other',
                                       extra=act)
                 continue

             # AMENDMENTS
             elif 'AMENDMENTS' in action:
                 rows = table.xpath('tr')[1:]
                 for row in rows:
                     act = row.text_content().strip()
                     if act == '':
                         continue
                     if 'passed' in act or 'adopted' in act:
                         a_type = 'amendment:passed'
                     elif 'failed' in act:
                         a_type = 'amendment:failed'
                     elif 'withdrawn' in act:
                         a_type = 'amendment:withdrawn'
                     else:
                         a_type = 'other'
                     # actor and date are deliberately carried over from
                     # the previously processed action table
                     bill.add_action(actor, act, date, type=a_type)
                 continue

             # CONFERENCE COMMITTEE (not handled), e.g.
             # http://www.azleg.gov/FormatDocument.asp?inDoc=/legtext/49Leg/2r/bills/hb2083o.asp

             # MISCELLANEOUS MOTION (not handled)

             # MOTION TO RECONSIDER
             elif action == 'MOTION TO RECONSIDER:':
                 # actor is carried over from the previously processed table
                 date = utils.get_date(table[1][1])
                 if date:
                     if table[1][0].text_content().strip() == 'Vote Detail':
                         vote_url = table[1][0].xpath('string(a/@href)')
                         # This branch never computes an action type of its
                         # own, so record the motion as 'other' (as the
                         # sibling branch does) rather than reusing whatever
                         # a_type an earlier table happened to leave behind.
                         bill.add_action(actor, action, date, type='other')
                         # Use the date parsed above; `vote_date` belongs to
                         # the FINAL/THIRD READ branch and is either unbound
                         # or stale here.
                         self.scrape_votes(actor, vote_url, bill, date,
                                           motion='motion to reconsider',
                                           type='other')
                     else:
                         action = table[-1][1].text_content().strip()
                         bill.add_action(actor, action, date, type='other')
                 continue

             elif action.endswith(('FINAL READ:', 'THIRD READ:')):
                 # house|senate final and third read
                 rows = table.xpath('tr')
                 # need to find out if third read took place in house or
                 # senate: if an ancestor table contains 'TRANSMIT TO' then
                 # the action is taking place in that chamber, else it is
                 # in chamber
                 actor = utils.get_actor(rows[0], chamber)
                 # get a dict of keys from the header and values from the row
                 k_rows = utils.get_rows(rows[1:], rows[0])
                 action = rows[0][0].text_content().strip()
                 for row in k_rows:
                     a_type = [get_action_type(action, 'Generic')]
                     if row[action].text_content().strip() == 'Vote Detail':
                         vote_url = row.pop(action).xpath('string(a/@href)')
                         vote_date = utils.get_date(row.pop('DATE'))
                         passed = row.pop('RESULT').text_content().strip()
                         # what remains in row (vote counts, amended,
                         # emergency, two-thirds and possibly rfe) is
                         # forwarded to scrape_votes as keyword arguments
                         pass_fail = {'PASSED': 'bill:passed',
                                      'FAILED': 'bill:failed'}[passed]
                         a_type.append(pass_fail)
                         bill.add_action(actor, action, vote_date,
                                         type=a_type)
                         row['type'] = 'passage'
                         self.scrape_votes(actor, vote_url, bill, vote_date,
                                           passed=passed, motion=action,
                                           **row)
                     else:
                         date = utils.get_date(row.pop('DATE'))
                         if date:
                             bill.add_action(actor, action, date, type=a_type)
                 continue

             elif 'TRANSMITTED TO' in action:
                 # transmitted to Governor or secretary of the state
                 # SoS if it goes to voters as a proposition and memorials, etc
                 rows = table.xpath('tr')
                 actor = utils.get_actor(rows[0], chamber)
                 sent_to = rows[0][1].text_content().strip()
                 date = utils.get_date(rows[0][2])
                 a_type = 'governor:received' if sent_to[0] == 'G' else 'other'
                 bill.add_action(actor, "TRANSMITTED TO " + sent_to, date,
                                 type=a_type)
                 # Collect the recipient's response (signed/vetoed), the
                 # chapter and the chaptered/transmitted version, if listed
                 act, date, chapter, version = '', '', '', ''
                 for row in rows[1:]:
                     label = row[0].text_content().strip()
                     if label == 'ACTION:':
                         act = row[1].text_content().strip()
                         date = utils.get_date(row[2])
                     elif label == 'CHAPTER:':
                         chapter = row[1].text_content().strip()
                     elif label in ('CHAPTERED VERSION:',
                                    'TRANSMITTED VERSION:'):
                         version = row[1].text_content().strip()
                 if act and sent_to == 'GOVERNOR':
                     a_type = 'governor:signed' if act == 'SIGNED' else 'governor:vetoed'
                     if chapter:
                         bill.add_action(sent_to.lower(), act, date,
                                         type=a_type, chapter=chapter,
                                         chaptered_version=version)
                     else:
                         bill.add_action(sent_to.lower(), act, date,
                                         type=a_type)
                 continue

             # this is probably only important for historical legislation
             elif 'FINAL DISPOSITION' in action:
                 rows = table.xpath('tr')
                 if rows:
                     disposition = rows[0][1].text_content().strip()
                     bill['final_disposition'] = disposition
     bill = self.sort_bill_actions(bill)
     self.save_bill(bill)
Example #2
0
    def scrape_bill(self, chamber, session, bill_id):
        """
        Scrape one bill's documents, sponsors, actions and votes, then save it.

        chamber  -- 'upper' or 'lower'; the chamber the bill starts in.  It
                    is stored on the Bill and seeds the "current chamber"
                    used as the actor for floor actions.
        session  -- session identifier; mapped to the site's id by
                    self.get_session_id and to a URL path segment by
                    utils.legislature_to_number.
        bill_id  -- bill identifier used in both page URLs.

        Two pages are fetched: the "documents for bill" page (versions,
        fact sheets, agendas, calendars, amendments, videos) and the bill
        overview page (sponsors, committee referrals and votes, readings,
        transmittals, floor votes and the governor's action).
        """
        def cell_text(cell):
            # Whitespace-trimmed text content of an lxml element.
            return cell.text_content().strip()

        def parse_date(text):
            # Dates on the overview page are parsed as mm/dd/yy.
            return datetime.datetime.strptime(text, '%m/%d/%y')

        session_id = self.get_session_id(session)
        url = base_url + 'DocumentsForBill.asp?Bill_Number=%s&Session_ID=%s' % (
                                                            bill_id, session_id)
        with self.urlopen(url) as docs_for_bill:
            root = html.fromstring(docs_for_bill)
            bill_title = root.xpath(
                            '//div[@class="ContentPageTitle"]')[1].text.strip()
            # Depending on the progress the bill has made through the house
            # some tables might not exist.  The links that have
            # javascript:Show**** each have a table with related
            # documents/calendars/agendas/versions.  The sponsors link is
            # skipped because that information is on the bill overview page
            # where all of the actions are found.
            doc_section_links = root.xpath(
                                    '//a[contains(@href, "javascript:Show")]')
            bill = Bill(session, chamber, bill_id, bill_title)
            bill.type = self.get_bill_type(bill_id[:-4])
            bill.add_source(url)
            for link in doc_section_links:
                link_id = utils.parse_link_id(link)
                link_text = cell_text(link)
                div_path = '//div[@id="%s"]/table//tr' % link_id
                if link_text == 'Show Versions':
                    # the first row has only a comment
                    for tr in root.xpath(div_path)[1:]:
                        tds = tr.cssselect('td')
                        if len(tds) >= 4:
                            bill_version = cell_text(tds[1])
                            bill_html = tds[2].xpath('string(font/a/@href)')
                            bill_pdf = tds[3].xpath('string(font/a/@href)')
                            bill.add_version(bill_version,
                                             bill_html, pdf_url=bill_pdf)
                elif link_text == 'Show Summaries/Fact Sheets':
                    # the first row has only a comment
                    for tr in root.xpath(div_path)[1:]:
                        tds = tr.cssselect('td')
                        if len(tds) > 1:
                            fact_sheet = cell_text(tds[1])
                            fact_sheet_url = tds[1].xpath(
                                                    'string(font/a/@href)')
                            bill.add_document(fact_sheet,
                                              fact_sheet_url, type="fact sheet")
                elif link_text in ('Show Senate Agendas', 'Show House Agendas'):
                    # Substring test: re.match() is anchored at the start of
                    # the string and so can never find 'House' in
                    # 'Show House Agendas'.
                    if 'House' in link_text:
                        agenda_type = 'House Agenda'
                    else:
                        agenda_type = 'Senate Agenda'
                    # the first row has only a comment
                    # the second row is the table header
                    for tr in root.xpath(div_path)[2:]:
                        tds = tr.cssselect('td')
                        if len(tds) >= 8:
                            agenda_committee = cell_text(tds[0])
                            agenda_html = tds[7].xpath('string(a/@href)').strip()
                            bill.add_document(agenda_committee,
                                              agenda_html, type=agenda_type)
                elif link_text in ('Show Senate Calendars',
                                    'Show House Calendar'):
                    # the first row has only a comment
                    # the second row is the table header
                    for tr in root.xpath(div_path)[2:]:
                        tds = tr.cssselect('td')
                        if len(tds) >= 6:
                            calendar_name = cell_text(tds[0])
                            calendar_html = tds[5].xpath('string(a/@href)')
                            bill.add_document(calendar_name,
                                              calendar_html, type="calendar")
                elif link_text in ('Show Adopted Amendments',
                                   'Show Proposed Amendments'):
                    for tr in root.xpath(div_path)[1:]:
                        tds = tr.cssselect('td')
                        # skip rows too short to hold a title and a link
                        if len(tds) >= 3:
                            amendment_title = cell_text(tds[1])
                            amendment_link = tds[2].xpath('string(font/a/@href)')
                            bill.add_document(amendment_title,
                                              amendment_link, type='amendment')
                elif link_text == 'Show Bill Videos':
                    for tr in root.xpath(div_path)[2:]:
                        tds = tr.cssselect('td')
                        if len(tds) >= 3:
                            video_title = cell_text(tds[1])
                            video_link = tds[2].xpath('string(a/@href)')
                            video_date = cell_text(tds[0])
                            bill.add_document(video_title, video_link,
                                              date=video_date, type='video')

        # e.g. http://www.azleg.gov/FormatDocument.asp?inDoc=/legtext/49leg/2r/bills/hb2001o.asp
        # again the actions page may or may not have a given table and the
        # order of the actions depends on the chamber the bill originated in.
        ses_num = utils.legislature_to_number(session)
        action_url = base_url + 'FormatDocument.asp?inDoc=/legtext/%s/bills/%so.asp' % (ses_num, bill_id.lower())
        with self.urlopen(action_url) as action_page:
            bill.add_source(action_url)
            root = html.fromstring(action_page)
            action_tables = root.xpath('/html/body/div/table/tr[3]/td[4]/table/tr/td/table/tr/td/table')
            # Tracks which chamber currently holds the bill.  It has to be
            # initialised once, outside the loop, so that a TRANSMIT TO
            # table can change the actor used by the tables that follow it.
            house = chamber != 'upper'
            for table in action_tables:
                rows = table.cssselect('tr')
                # header text of the table minus its last character
                action = cell_text(table.cssselect('td')[0])[:-1]
                if action == 'SPONSORS':
                    if len(rows[0]) == 4:
                        for row in rows:
                            tds = row.cssselect('td')
                            # second cell: sponsor name (and link),
                            # third cell: sponsor type
                            name_cell, type_cell = tds[1], tds[2]
                            bill.add_sponsor(cell_text(type_cell),
                                             cell_text(name_cell),
                                             sponsor_link=name_cell.xpath('string(a/@href)'))
                elif action == 'COMMITTEES':
                    # the html for this table has meta tags that give the
                    # chamber and the committee abbreviation
                    # <meta name="HCOMMITTEE" content="RULES">
                    # question for actions: in the case of committees would
                    # House Rules be better for an actor?
                    for row in rows[1:]:
                        tds = row.cssselect('td')
                        meta_tag = row.cssselect('meta')[0]
                        meta_name = meta_tag.get('name')
                        committee = meta_tag.get('content')
                        actor = "%s:%s" % (meta_name, committee)
                        # NOTE(review): 'reffered' is misspelled, but it is
                        # emitted data and is kept byte-for-byte here.
                        act = 'committee:reffered'
                        date = parse_date(cell_text(tds[1]))
                        bill.add_action(actor, act, date, type='committee:referred')
                        if len(tds) == 5:
                            # use the fourth cell's date when it holds one,
                            # otherwise keep the referral date parsed above
                            reported = cell_text(tds[3])
                            if re.match(r'\d{2}/\d{2}/\d{2}', reported):
                                date = parse_date(reported)
                            act = cell_text(tds[4])
                            status = 'other'
                            bill.add_action(actor, act, date, type=status, status=status)
                        elif len(tds) == 6:
                            where = 'lower' if meta_name == 'HCOMMITTEE' else 'upper'
                            date = parse_date(cell_text(tds[3]))
                            # counts are printed as "(a-b-c-d)" or
                            # "(a-b-c-d-e)"; strip the surrounding brackets
                            counts = cell_text(tds[4])[1:-1].split('-')
                            if len(counts) == 4:
                                yes, no, nv, exc = counts
                            else:
                                yes, no, excused, absent, nv = counts
                            motion = cell_text(tds[5])
                            yes, no, nv = int(yes), int(no), int(nv)
                            # compare as integers: as strings '9' > '10'
                            passed = yes > no
                            vote = Vote(where, date, motion, passed, yes, no, nv, committee=committee)
                            vote.add_source(tds[0].xpath('string(a/@href)').strip())
                            bill.add_vote(vote)
                elif action in ('HOUSE FIRST READ', 'HOUSE SECOND READ'):
                    a_type = 'committee:referred' if 'HOUSE FIRST' in action else 'other'
                    bill.add_action('lower', action, utils.get_date(rows[0][1]),
                                    type=a_type)
                elif action in ('SENATE FIRST READ', 'SENATE SECOND READ'):
                    # NOTE(review): the senate marks the SECOND read as the
                    # referral while the house marks the FIRST; kept as
                    # written, confirm this asymmetry is intended.
                    a_type = 'committee:referred' if 'SECOND' in action else 'other'
                    bill.add_action('upper', action, utils.get_date(rows[0][1]),
                                    type=a_type)
                elif action in ('TRANSMIT TO HOUSE', 'TRANSMIT TO SENATE'):
                    # endswith() rather than re.match(): the latter is
                    # anchored at the start and never matches 'HOUSE' in
                    # 'TRANSMIT TO HOUSE'.
                    # NOTE(review): the actor recorded is the receiving
                    # chamber, as the original expression intended; confirm.
                    house = action.endswith('HOUSE')
                    actor = 'lower' if house else 'upper'
                    date = utils.get_date(rows[0][1])
                    bill.add_action(actor, action, date)
                elif re.match(r'COW ACTION \d', action):
                    actor = 'lower' if house else 'upper'
                    for row in rows[1:]:
                        date = utils.get_date(row[1])
                        bill.add_action(actor, action, date, motion=cell_text(row[2]))
                elif action in ('HOUSE FINAL READ', 'SENATE FINAL READ', 'THIRD READ'):
                    actor = 'lower' if house else 'upper'
                    for row in rows[1:]:
                        cells = [cell_text(x) for x in row]
                        if not cells or cells[0] != 'Vote Detail':
                            continue
                        # rows have 10 cells, or 11 when an "amended"
                        # column is present; anything else is skipped
                        if len(cells) == 10:
                            (date, ayes, nays, nv, exc, emer,
                             rfe, two_thirds, result) = cells[1:]
                            extra = {}
                        elif len(cells) == 11:
                            (date, ayes, nays, nv, exc, emer, amend,
                             rfe, two_thirds, result) = cells[1:]
                            extra = {'amended': amend}
                        else:
                            continue
                        passed = result == 'PASSED'
                        date = parse_date(date) if date else ''
                        vote = Vote(actor, date, action, passed,
                                    int(ayes), int(nays), int(nv),
                                    excused=int(exc), emergency=emer, rfe=rfe,
                                    two_thirds_vote=two_thirds, type="passage",
                                    **extra)
                        vote.add_source(row[0].xpath('string(a/@href)').strip())
                        bill.add_vote(vote)
                elif action == 'TRANSMITTED TO':
                    actor = 'lower' if house else 'upper'
                    act = action + ": " + cell_text(rows[0][1])
                    date = parse_date(cell_text(rows[0][2]))
                    bill.add_action(actor, act, date, type='governor:received')
                    # need action and chapter, chaptered version if they exist
                    act, date, chapter, version = '', '', '', ''
                    for row in rows[1:]:
                        # compare labels without the trailing colon so both
                        # 'CHAPTER' and 'CHAPTER:' are recognised
                        label = cell_text(row[0]).rstrip(':')
                        if label == 'ACTION':
                            act = cell_text(row[1])
                            date = parse_date(cell_text(row[2]))
                        elif label == 'CHAPTER':
                            chapter = cell_text(row[1])
                        elif label == 'CHAPTERED VERSION':
                            version = cell_text(row[1])
                    if act:
                        action_type = 'governor:signed' if act == 'SIGNED' else 'governor:vetoed'
                        if chapter:
                            bill.add_action('governor', act, date,
                                            type=action_type, chapter=chapter,
                                            chaptered_version=version)
                        else:
                            bill.add_action('governor', act, date,
                                            type=action_type)
        self.save_bill(bill)
        self.log("saved: " + bill['bill_id'])