def scrape_actions(self, chamber, session, bill):
    """
    Scrape the actions for a given bill

    Fetches the bill overview page and walks its tables, adding sponsors,
    alternate titles, actions and (through self.scrape_votes) votes to
    ``bill``.  The bill is then passed through self.sort_bill_actions()
    and saved with self.save_bill().

    chamber -- 'upper' or 'lower'; compared with each action's actor to
               decide whether a first reading is also the introduction
    session -- session name, converted with utils.legislature_to_number()
               to build the overview URL
    bill    -- bill object; bill['bill_id'] and bill['title'] are read and
               bill['final_disposition'] may be set
    """
    ses_num = utils.legislature_to_number(session)
    bill_id = bill['bill_id'].replace(' ', '')
    action_url = BASE_URL + 'FormatDocument.asp?inDoc=/legtext/%s/bills/%so.asp' % (ses_num, bill_id.lower())
    with self.urlopen(action_url) as action_page:
        bill.add_source(action_url)
        root = html.fromstring(action_page)
        base_table = root.xpath('//table[@class="ContentAreaBackground"]')[0]
        # common xpaths
        table_path = '//table[contains(tr/td/b/text(), "%s")]'

        # sponsors
        for sponsor in base_table.xpath('//sponsor'):
            name = sponsor.text.strip()
            # the sponsor type is the text of the element that follows the
            # sponsor's grandparent element
            s_type = sponsor.getparent().getparent().getnext().text_content().strip()
            bill.add_sponsor(s_type, name)

        # titles
        title_tables = base_table.xpath(table_path % 'TITLE')
        if title_tables:
            for row in title_tables[0].iterchildren('tr'):
                title = row[1].text_content().strip()
                if title != bill['title']:
                    bill.add_title(title)

        # NOTE: `actor` and `date` deliberately carry over from one table
        # to the next; the AMENDMENTS and MOTION TO RECONSIDER branches
        # reuse the values set by the preceding table.
        for table in base_table.xpath('tr/td/table'):
            action = table.xpath('string(tr[1]/td[1])').strip()
            if action == '':
                action = table.xpath('string(tr[1])').strip()

            if (action.endswith(('FIRST READ:', 'SECOND READ:')) or
                    'WAIVED' in action):
                for row in table.xpath('tr'):
                    # drop the trailing character (the ':' of the label)
                    action = row[0].text_content().strip()[:-1]
                    actor = 'lower' if action.startswith('H') else 'upper'
                    date = utils.get_date(row[1])
                    if action.endswith(('FIRST READ', 'FIRST WAIVED')):
                        # a first reading in the chamber the bill belongs
                        # to is also its introduction
                        if actor == chamber:
                            a_type = ['bill:introduced', 'bill:reading:1']
                        else:
                            a_type = 'bill:reading:1'
                    else:
                        a_type = 'bill:reading:2'
                    bill.add_action(actor, action, date, type=a_type)

            elif action == 'COMMITTEES:':
                # committee assignments; the first row is skipped
                for row in table.xpath('tr')[1:]:
                    # First add the committee assigned action
                    meta_tag = row.cssselect('meta')[0]
                    # @name is HCOMMITTEE OR SCOMMITTEE
                    h_or_s = meta_tag.get('name')[0]
                    # @content is committee abbrv
                    committee = meta_tag.get('content')
                    # actor is house or senate referring the bill to committee
                    actor = 'lower' if h_or_s.lower() == 'h' else 'upper'
                    act = 'assigned to committee: ' + committee
                    date = utils.get_date(row[1])
                    bill.add_action(actor, act, date,
                                    type='committee:referred')
                    # now lets see if there is a vote
                    vote_url = row[0].xpath('string(a/@href)')
                    if vote_url:
                        date = utils.get_date(row[3])
                        act = row[5].text_content().strip()
                        a_type = get_action_type(act, 'COMMITTEES:')
                        bill.add_action(actor, committee + ":" + act, date,
                                        type=a_type)
                        self.scrape_votes(actor, vote_url, bill, date,
                                          motion='committee: ' + act,
                                          committee=committee,
                                          type='other')
                    elif len(row) == 5:
                        # probably senate rules committee
                        date = utils.get_date(row[3])
                        if date == '':
                            date = utils.get_date(row[1])
                        act = row[4].text_content().strip()
                        a_type = get_action_type(act, 'COMMITTEES:')
                        bill.add_action(actor, committee + ":" + act, date,
                                        type=a_type)

            elif 'CAUCUS' in action:
                for row in table.xpath('tr')[0:2]:
                    actor = utils.get_actor(row, chamber)
                    action = row[0].text_content().strip()
                    if action.endswith(':'):
                        action = action[:-1]
                    # majority caucus Y|N
                    result = row[2].text_content().strip()
                    action = action + " concur: " + result
                    date = utils.get_date(row[1])
                    bill.add_action(actor, action, date, concur=result,
                                    type='other')

            # transmit to house or senate
            elif 'TRANSMIT TO' in action:
                for row in table.xpath('tr'):
                    action = row[0].text_content().strip()[:-1]
                    # the actor is the chamber doing the transmitting
                    actor = 'upper' if action.endswith('HOUSE') else 'lower'
                    date = utils.get_date(row[1])
                    bill.add_action(actor, action, date, type='other')

            # Committee of the whole actions
            elif 'COW ACTION' in action:
                rows = table.xpath('tr')
                actor = utils.get_actor(rows[0], chamber)
                if 'SIT COW ACTION' in action:
                    act = rows[0][-1].text_content().strip()
                    date = utils.get_date(rows[0][1])
                else:
                    act = rows[1][2].text_content().strip()
                    date = utils.get_date(rows[1][1])
                action = action + " " + act  # COW ACTION 1 DPA
                bill.add_action(actor, action, date, type='other')
                # only look for a vote link when a second row exists
                if (len(rows) > 1 and
                        rows[1][0].text_content().strip() == 'Vote Detail'):
                    vote_url = rows[1][0].xpath('string(a/@href)')
                    self.scrape_votes(actor, vote_url, bill, date,
                                      motion=action, type='other',
                                      extra=act)

            # AMENDMENTS
            elif 'AMENDMENTS' in action:
                for row in table.xpath('tr')[1:]:
                    act = row.text_content().strip()
                    if act == '':
                        continue
                    if 'passed' in act or 'adopted' in act:
                        a_type = 'amendment:passed'
                    elif 'failed' in act:
                        a_type = 'amendment:failed'
                    elif 'withdrawn' in act:
                        a_type = 'amendment:withdrawn'
                    else:
                        a_type = 'other'
                    # actor and date will same as previous action
                    bill.add_action(actor, act, date, type=a_type)

            # CONFERENCE COMMITTEE
            # http://www.azleg.gov/FormatDocument.asp?inDoc=/legtext/49Leg/2r/bills/hb2083o.asp
            # MISCELLANEOUS MOTION
            # MOTION TO RECONSIDER
            elif action == 'MOTION TO RECONSIDER:':
                date = utils.get_date(table[1][1])
                if date:
                    if table[1][0].text_content().strip() == 'Vote Detail':
                        vote_url = table[1][0].xpath('string(a/@href)')
                        # use this table's own date and a neutral type
                        # rather than names left over from earlier tables;
                        # actor is still the previous table's actor
                        bill.add_action(actor, action, date, type='other')
                        self.scrape_votes(actor, vote_url, bill, date,
                                          motion='motion to reconsider',
                                          type='other')
                    else:
                        action = table[-1][1].text_content().strip()
                        bill.add_action(actor, action, date, type='other')

            elif action.endswith(('FINAL READ:', 'THIRD READ:')):
                # house|senate final and third read
                rows = table.xpath('tr')
                # need to find out if third read took place in house or senate
                # if an ancestor table contains 'TRANSMIT TO' then the action
                # is taking place in that chamber, else it is in chamber
                actor = utils.get_actor(rows[0], chamber)
                # get a dict of keys from the header and values from the row
                k_rows = utils.get_rows(rows[1:], rows[0])
                action = rows[0][0].text_content().strip()
                for row in k_rows:
                    a_type = [get_action_type(action, 'Generic')]
                    if row[action].text_content().strip() == 'Vote Detail':
                        vote_url = row.pop(action).xpath('string(a/@href)')
                        vote_date = utils.get_date(row.pop('DATE'))
                        passed = row.pop('RESULT').text_content().strip()
                        # leaves vote counts, ammended, emergency, two-thirds
                        # and possibly rfe left in k_rows. get the vote counts
                        # from scrape votes and pass ammended and emergency
                        # as kwargs to sort them in scrap_votes
                        pass_fail = {'PASSED': 'bill:passed',
                                     'FAILED': 'bill:failed'}[passed]
                        a_type.append(pass_fail)
                        bill.add_action(actor, action, vote_date,
                                        type=a_type)
                        row['type'] = 'passage'
                        self.scrape_votes(actor, vote_url, bill, vote_date,
                                          passed=passed, motion=action,
                                          **row)
                    else:
                        date = utils.get_date(row.pop('DATE'))
                        if date:
                            bill.add_action(actor, action, date,
                                            type=a_type)

            elif 'TRANSMITTED TO' in action:
                # transmitted to Governor or secretary of the state
                # SoS if it goes to voters as a proposition and memorials, etc
                rows = table.xpath('tr')
                actor = utils.get_actor(rows[0], chamber)
                # actor is the actor from the previous statement because it is
                # never transmitted to G or S without third or final read
                sent_to = rows[0][1].text_content().strip()
                date = utils.get_date(rows[0][2])
                # startswith() is safe when the recipient cell is empty
                if sent_to.startswith('G'):
                    a_type = 'governor:received'
                else:
                    a_type = 'other'
                bill.add_action(actor, "TRANSMITTED TO " + sent_to, date,
                                type=a_type)
                # See if the actor is the governor and whether he signed
                # the bill or vetoed it
                act, date, chapter, version = '', '', '', ''
                for row in rows[1:]:
                    label = row[0].text_content().strip()
                    if label == 'ACTION:':
                        act = row[1].text_content().strip()
                        date = utils.get_date(row[2])
                    elif label == 'CHAPTER:':
                        chapter = row[1].text_content().strip()
                    elif label in ('CHAPTERED VERSION:',
                                   'TRANSMITTED VERSION:'):
                        version = row[1].text_content().strip()
                if act and sent_to == 'GOVERNOR':
                    if act == 'SIGNED':
                        a_type = 'governor:signed'
                    else:
                        a_type = 'governor:vetoed'
                    if chapter:
                        bill.add_action(sent_to.lower(), act, date,
                                        type=a_type, chapter=chapter,
                                        chaptered_version=version)
                    else:
                        bill.add_action(sent_to.lower(), act, date,
                                        type=a_type)

            # this is probably only important for historical legislation
            elif 'FINAL DISPOSITION' in action:
                rows = table.xpath('tr')
                if rows:
                    disposition = rows[0][1].text_content().strip()
                    bill['final_disposition'] = disposition

        bill = self.sort_bill_actions(bill)
        self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id):
    """
    Scrape a single bill: related documents, sponsors, actions and votes.

    The "DocumentsForBill" page is fetched first to create the Bill and
    attach versions, fact sheets, agendas, calendars, amendments and
    videos.  The bill overview page is then fetched to add sponsors,
    actions and votes.  The finished bill is passed to self.save_bill().

    chamber -- 'upper' for a Senate bill; any other value is treated as
               the House when deciding where floor actions take place
    session -- session name, resolved through self.get_session_id() and
               utils.legislature_to_number()
    bill_id -- bill identifier, used in the documents query string and
               (lower-cased) in the overview URL
    """
    date_format = '%m/%d/%y'
    session_id = self.get_session_id(session)
    url = base_url + 'DocumentsForBill.asp?Bill_Number=%s&Session_ID=%s' % (
        bill_id, session_id)
    with self.urlopen(url) as docs_for_bill:
        root = html.fromstring(docs_for_bill)
        bill_title = root.xpath(
            '//div[@class="ContentPageTitle"]')[1].text.strip()
        # Depending on the progress the bill has made through the house
        # some table might not exist, the links that have javascript:Show****
        # have a table with related documents/calendars/agendas/versions
        # I am skipping the sponsors link because that information is on the
        # bill overview page where all of the actions are found.
        doc_section_links = root.xpath(
            '//a[contains(@href, "javascript:Show")]')
        bill = Bill(session, chamber, bill_id, bill_title)
        bill.type = self.get_bill_type(bill_id[:-4])
        bill.add_source(url)

        for link in doc_section_links:
            link_id = utils.parse_link_id(link)
            link_text = link.text_content().strip()
            div_path = '//div[@id="%s"]/table//tr' % link_id

            if link_text == 'Show Versions':
                # the first row has only a comment
                for tr in root.xpath(div_path)[1:]:
                    tds = tr.cssselect('td')
                    if len(tds) >= 4:
                        bill_version = tds[1].text_content().strip()
                        bill_html = tds[2].xpath('string(font/a/@href)')
                        bill_pdf = tds[3].xpath('string(font/a/@href)')
                        bill.add_version(bill_version, bill_html,
                                         pdf_url=bill_pdf)

            elif link_text == 'Show Summaries/Fact Sheets':
                # the first row has only a comment
                for tr in root.xpath(div_path)[1:]:
                    tds = tr.cssselect('td')
                    if len(tds) > 1:
                        fact_sheet = tds[1].text_content().strip()
                        fact_sheet_url = tds[1].xpath(
                            'string(font/a/@href)')
                        bill.add_document(fact_sheet, fact_sheet_url,
                                          type="fact sheet")

            elif link_text in ('Show Senate Agendas', 'Show House Agendas'):
                # the link text starts with "Show", so test for the chamber
                # name anywhere in it rather than at the beginning
                if 'House' in link_text:
                    agenda_type = 'House Agenda'
                else:
                    agenda_type = 'Senate Agenda'
                # the first row has only a comment
                # the second row is the table header
                for tr in root.xpath(div_path)[2:]:
                    tds = tr.cssselect('td')
                    if len(tds) >= 8:
                        agenda_committee = tds[0].text_content().strip()
                        agenda_html = tds[7].xpath('string(a/@href)').strip()
                        bill.add_document(agenda_committee, agenda_html,
                                          type=agenda_type)

            elif link_text in ('Show Senate Calendars',
                               'Show House Calendar'):
                # the first row has only a comment
                # the second row is the table header
                for tr in root.xpath(div_path)[2:]:
                    tds = tr.cssselect('td')
                    if len(tds) >= 6:
                        calendar_name = tds[0].text_content().strip()
                        calendar_html = tds[5].xpath('string(a/@href)')
                        bill.add_document(calendar_name, calendar_html,
                                          type="calendar")

            elif link_text in ('Show Adopted Amendments',
                               'Show Proposed Amendments'):
                for tr in root.xpath(div_path)[1:]:
                    tds = tr.cssselect('td')
                    # skip rows too short to hold a title and a link
                    if len(tds) >= 3:
                        amendment_title = tds[1].text_content().strip()
                        amendment_link = tds[2].xpath(
                            'string(font/a/@href)')
                        bill.add_document(amendment_title, amendment_link,
                                          type='amendment')

            elif link_text == 'Show Bill Videos':
                for tr in root.xpath(div_path)[2:]:
                    tds = tr.cssselect('td')
                    if len(tds) >= 3:
                        video_title = tds[1].text_content().strip()
                        video_link = tds[2].xpath('string(a/@href)')
                        video_date = tds[0].text_content().strip()
                        bill.add_document(video_title, video_link,
                                          date=video_date, type='video')

    # action_url = 'http://www.azleg.gov/FormatDocument.asp?inDoc=/legtext/49leg/2r/bills/hb2001o.asp'
    # again the actions page may or may not have a given table and the order
    # of the actions depends on the chamber the bill originated in.
    ses_num = utils.legislature_to_number(session)
    action_url = base_url + 'FormatDocument.asp?inDoc=/legtext/%s/bills/%so.asp' % (ses_num, bill_id.lower())
    with self.urlopen(action_url) as action_page:
        bill.add_source(action_url)
        root = html.fromstring(action_page)
        action_tables = root.xpath('/html/body/div/table/tr[3]/td[4]/table/tr/td/table/tr/td/table')
        # Tracks which chamber the following floor actions belong to.  It
        # starts in the bill's own chamber and is updated by TRANSMIT TO,
        # so it must live outside the loop to persist between tables.
        house = chamber != 'upper'
        for table in action_tables:
            rows = table.cssselect('tr')
            # first cell text without its trailing character (the ':')
            action = table.cssselect('td')[0].text_content().strip()[:-1]

            if action == 'SPONSORS':
                if len(rows[0]) == 4:
                    for row in rows:
                        tds = row.cssselect('td')
                        if len(tds) < 3:
                            continue
                        # cell 1 holds the sponsor name (and link),
                        # cell 2 the sponsor type
                        name_td, type_td = tds[1], tds[2]
                        bill.add_sponsor(
                            type_td.text_content().strip(),
                            name_td.text_content().strip(),
                            sponsor_link=name_td.xpath('string(a/@href)'))

            elif action == 'COMMITTEES':
                # the html for this table has meta tags that give the chamber
                # and the committee abbreviation
                # <meta name="HCOMMITTEE" content="RULES">
                # question for actions: in the case of committees would House
                # Rules be better for an actor?
                for row in rows[1:]:
                    tds = row.cssselect('td')
                    meta_tag = row.cssselect('meta')[0]
                    meta_name = meta_tag.get('name')
                    committee = meta_tag.get('content')
                    actor = "%s:%s" % (meta_name, committee)
                    act = 'committee:reffered'
                    date = datetime.datetime.strptime(
                        tds[1].text_content().strip(), date_format)
                    bill.add_action(actor, act, date,
                                    type='committee:referred')
                    if len(tds) == 5:
                        # use the action date when present, otherwise keep
                        # the referral date parsed above
                        action_date = tds[3].text_content().strip()
                        if re.match(r'\d{2}/\d{2}/\d{2}', action_date):
                            date = datetime.datetime.strptime(
                                action_date, date_format)
                        act = tds[4].text_content().strip()
                        status = 'other'
                        bill.add_action(actor, act, date, type=status,
                                        status=status)
                    elif len(tds) == 6:
                        where = ('lower' if meta_name == 'HCOMMITTEE'
                                 else 'upper')
                        date = datetime.datetime.strptime(
                            tds[3].text_content().strip(), date_format)
                        # strip the surrounding characters from the tally,
                        # e.g. "(5-2-0-1)" -> "5-2-0-1"
                        counts = tds[4].text_content().strip()[1:-1].split('-')
                        if len(counts) == 4:
                            yes, no, nv, exc = counts
                        else:
                            yes, no, excused, absent, nv = counts
                        yes, no, nv = int(yes), int(no), int(nv)
                        motion = tds[5].text_content().strip()
                        # compare as integers; comparing the raw strings
                        # would order '10' before '9'
                        passed = yes > no
                        vote = Vote(where, date, motion, passed, yes, no,
                                    nv, committee=committee)
                        vote.add_source(
                            tds[0].xpath('string(a/@href)').strip())
                        bill.add_vote(vote)

            elif action in ('HOUSE FIRST READ', 'HOUSE SECOND READ'):
                aType = 'other'
                if 'HOUSE FIRST' in action:
                    aType = 'committee:referred'
                bill.add_action('lower', action,
                                utils.get_date(rows[0][1]), type=aType)

            elif action in ('SENATE FIRST READ', 'SENATE SECOND READ'):
                aType = 'other'
                if 'SECOND' in action:
                    aType = 'committee:referred'
                bill.add_action('upper', action,
                                utils.get_date(rows[0][1]), type=aType)

            elif action in ('TRANSMIT TO HOUSE', 'TRANSMIT TO SENATE'):
                # the chamber name is at the end of the label, so an
                # anchored re.match('HOUSE', ...) could never succeed
                actor = 'lower' if action.endswith('HOUSE') else 'upper'
                house = actor == 'lower'
                date = utils.get_date(rows[0][1])
                bill.add_action(actor, action, date)

            elif re.match(r'COW ACTION \d', action):
                actor = 'lower' if house else 'upper'
                for row in rows[1:]:
                    date = utils.get_date(row[1])
                    bill.add_action(actor, action, date,
                                    motion=row[2].text_content().strip())

            elif action in ('HOUSE FINAL READ', 'SENATE FINAL READ',
                            'THIRD READ'):
                actor = 'lower' if house else 'upper'
                for row in rows[1:]:
                    if row[0].text_content().strip() != 'Vote Detail':
                        continue
                    cells = [x.text_content().strip() for x in row]
                    extra = {}
                    if len(cells) == 10:
                        (detail, date, ayes, nays, nv, exc, emer, rfe,
                         two_thirds, result) = cells
                    elif len(cells) == 11:
                        # the wider layout has an extra "amended" column
                        (detail, date, ayes, nays, nv, exc, emer, amend,
                         rfe, two_thirds, result) = cells
                        extra['amended'] = amend
                    else:
                        continue
                    passed = result == 'PASSED'
                    motion = action
                    if date:
                        date = datetime.datetime.strptime(date, date_format)
                    else:
                        date = ''
                    vote = Vote(actor, date, motion, passed, int(ayes),
                                int(nays), int(nv), excused=int(exc),
                                emergency=emer, rfe=rfe,
                                two_thirds_vote=two_thirds, type="passage",
                                **extra)
                    vote.add_source(row[0].xpath('string(a/@href)').strip())
                    bill.add_vote(vote)

            elif action == 'TRANSMITTED TO':
                actor = 'lower' if house else 'upper'
                act = action + ": " + rows[0][1].text_content().strip()
                date = datetime.datetime.strptime(
                    rows[0][2].text_content().strip(), date_format)
                bill.add_action(actor, act, date, type='governor:received')
                # need action and chaptered, chaptered version if they exists
                act, date, chapter, version = '', '', '', ''
                for row in rows[1:]:
                    # accept labels with or without the trailing colon
                    label = row[0].text_content().strip().rstrip(':')
                    if label == 'ACTION':
                        act = row[1].text_content().strip()
                        date = datetime.datetime.strptime(
                            row[2].text_content().strip(), date_format)
                    elif label == 'CHAPTER':
                        chapter = row[1].text_content().strip()
                    elif label == 'CHAPTERED VERSION':
                        version = row[1].text_content().strip()
                if act:
                    if act == 'SIGNED':
                        action_type = 'governor:signed'
                    else:
                        action_type = 'governor:vetoed'
                    if chapter:
                        bill.add_action('governor', act, date,
                                        type=action_type, chapter=chapter,
                                        chaptered_version=version)
                    else:
                        bill.add_action('governor', act, date,
                                        type=action_type)

    self.save_bill(bill)
    self.log("saved: " + bill['bill_id'])