def scrape_current(self, chamber, term):
    """Scrape current-term bills for one chamber from the KS bill_status API.

    Fetches the full bill_status JSON feed, keeps bills whose id starts with
    this chamber's letter ('S'/'H'), and builds a Bill with titles, sponsors
    and typed actions before handing off to scrape_html/save_bill.
    """
    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']
        for bill_data in bills:
            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'
            else:
                # fix: btype was left unbound for ids containing none of
                # CR/R/B, which raised NameError at Bill() below
                self.warning('unrecognized bill type for %s' % bill_id)
                btype = 'bill'

            # main
            bill = Bill(term, chamber, bill_id, bill_data['SHORTTITLE'],
                        type=btype, status=bill_data['STATUS'])
            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())
            if bill_data['LONGTITLE']:
                bill.add_title(bill_data['LONGTITLE'])

            # a lone sponsor is primary; any group are all cosponsors
            for sponsor in bill_data['SPONSOR_NAMES']:
                stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                         else 'cosponsor')
                bill.add_sponsor(stype, sponsor)

            # history is backwards
            for event in reversed(bill_data['HISTORY']):
                actor = ('upper' if event['chamber'] == 'Senate'
                         else 'lower')
                date = datetime.datetime.strptime(
                    event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']
                if event['action_code'] not in ksapi.action_codes:
                    self.warning('unknown action code on %s: %s %s' %
                                 (bill_id, event['action_code'],
                                  event['status']))
                    atype = 'other'
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(actor, action, date, type=atype)

            self.scrape_html(bill)
            self.save_bill(bill)
def process_bill(self, data):
    """Convert a scraped OCD-style bill dict into a billy Bill and save it."""
    chamber = parse_psuedo_id(data['from_organization'])['classification']
    # billy has no 'legislature' chamber; map unicameral bodies to 'upper'
    if chamber == 'legislature':
        chamber = 'upper'

    bill = Bill(data['legislative_session'], chamber, data['identifier'],
                data['title'], subjects=data['subject'],
                type=data['classification'])
    abstracts = data['abstracts']
    if abstracts:
        bill['summary'] = abstracts[0]['abstract']
    bill.update(**data['extras'])

    for action in data['actions']:
        actor = parse_psuedo_id(action['organization_id'])['classification']
        committees, legislators = [], []
        # split related entities by kind for billy's action kwargs
        for entity in action['related_entities']:
            if entity['entity_type'] == 'organization':
                committees.append(entity['name'])
            elif entity['entity_type'] == 'person':
                legislators.append(entity['name'])
        bill.add_action(actor,
                        action['description'],
                        parse_date(action['date']),
                        type=_action_categories(action['classification']),
                        committees=committees,
                        legislators=legislators,
                        **action.get('extras', {}))

    for source in data['sources']:
        bill.add_source(source['url'])

    for sponsor in data['sponsorships']:
        bill.add_sponsor(sponsor['classification'], sponsor['name'])

    # versions and documents each carry one or more links per entry
    for version in data['versions']:
        for link in version['links']:
            bill.add_version(version['note'], link['url'],
                             mimetype=link['media_type'],
                             date=parse_date(version['date']),
                             **version.get('extras', {}))

    for doc in data['documents']:
        for link in doc['links']:
            bill.add_document(doc['note'], link['url'],
                              mimetype=link['media_type'],
                              date=parse_date(doc['date']),
                              **doc.get('extras', {}))

    for title in data['other_titles']:
        bill.add_title(title['title'])

    for related in data['related_bills']:
        bill.add_companion(related['identifier'],
                           related['legislative_session'],
                           chamber)

    self.save_bill(bill)
def scrape_current(self, chamber, term):
    """Scrape current-term KS bills for one chamber, including roll-call votes.

    Keeps only bills with at least one HISTORY event in this chamber, then
    records titles, the 'apn' document/version, sponsors, actions and any
    votes matched out of the action status text.
    """
    chamber_name = "Senate" if chamber == "upper" else "House"
    # perhaps we should save this data so we can make on request for both chambers?
    with self.urlopen(ksapi.url + "bill_status/") as bill_request:
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json["content"]
        for bill_data in bills:
            # filtering out other chambers
            # fix: the flag was initialized as `bill_equal_chamber` but set
            # and tested as `bill_is_in_chamber`, so any bill with no event
            # in this chamber raised NameError instead of being skipped
            bill_is_in_chamber = False
            for history in bill_data["HISTORY"]:
                if history["chamber"] == chamber_name:
                    bill_is_in_chamber = True
            if not bill_is_in_chamber:
                continue

            # main
            bill = Bill(term, chamber, bill_data["BILLNO"],
                        bill_data["SHORTTITLE"])
            bill.add_source(ksapi.url + "bill_status/" +
                            bill_data["BILLNO"].lower())
            if bill_data["LONGTITLE"]:
                bill.add_title(bill_data["LONGTITLE"])
            bill.add_document("apn", ksapi.ksleg + bill_data["apn"])
            bill.add_version("Latest", ksapi.ksleg + bill_data["apn"])

            # a lone sponsor is primary; any group are all cosponsors
            for sponsor in bill_data["SPONSOR_NAMES"]:
                stype = ("primary" if len(bill_data["SPONSOR_NAMES"]) == 1
                         else "cosponsor")
                bill.add_sponsor(stype, sponsor)

            for event in bill_data["HISTORY"]:
                # fix: committee/conferee names live on the event, not on
                # bill_data, and the old elif branches consulted the stale
                # `history` loop variable instead of `event`
                if "committee_names" in event and "conferee_names" in event:
                    actor = " and ".join(event["committee_names"] +
                                         event["conferee_names"])
                elif "committee_names" in event:
                    actor = " and ".join(event["committee_names"])
                elif "conferee_names" in event:
                    actor = " and ".join(event["conferee_names"])
                else:
                    # fix: `chamber` is already 'upper'/'lower'; the old
                    # comparison against "Senate" could never be true, so
                    # everything fell back to 'lower'
                    actor = chamber

                date = datetime.datetime.strptime(
                    event["occurred_datetime"], "%Y-%m-%dT%H:%M:%S")
                bill.add_action(actor, event["status"], date)

                if event["action_code"] in ksapi.voted:
                    votes = votes_re.match(event["status"])
                    if votes:
                        vote = Vote(
                            chamber,
                            date,
                            votes.group(1),
                            event["action_code"] in ksapi.passed,
                            int(votes.group(2)),
                            int(votes.group(3)),
                            0,
                        )
                        vote.add_source(ksapi.ksleg + "bill_status/" +
                                        bill_data["BILLNO"].lower())
                        bill.add_vote(vote)

            self.save_bill(bill)
def process_bill(self, data):
    """Convert a scraped OCD-style bill dict into a billy Bill and save it."""
    chamber = parse_psuedo_id(data['from_organization'])['classification']
    bill = Bill(data['legislative_session'], chamber, data['identifier'],
                data['title'], subjects=data['subject'],
                type=data['classification'])
    if data['abstracts']:
        # billy stores a single summary; use the first abstract
        bill['summary'] = data['abstracts'][0]['abstract']
    bill.update(**data['extras'])

    for action in data['actions']:
        actor = parse_psuedo_id(action['organization_id'])['classification']
        bill.add_action(actor,
                        action['description'],
                        parse_date(action['date']),
                        type=_action_categories(action['classification']))
        # TODO: related entities

    for source in data['sources']:
        bill.add_source(source['url'])

    for sponsor in data['sponsorships']:
        bill.add_sponsor(sponsor['classification'], sponsor['name'])

    for version in data['versions']:
        for link in version['links']:
            bill.add_version(version['note'], link['url'],
                             mimetype=link['media_type'],
                             date=parse_date(version['date']))

    for doc in data['documents']:
        for link in doc['links']:
            bill.add_document(doc['note'], link['url'],
                              mimetype=link['media_type'],
                              date=parse_date(doc['date']))

    for title in data['other_titles']:
        # fix: other_titles entries are dicts ({'title': ...}); passing the
        # whole dict stored a dict where add_title expects a string
        bill.add_title(title['title'])

    # TODO: related bills
    # for related in data['related_bills']:

    self.save_bill(bill)
def process_bill(self, data):
    """Build a billy Bill from a scraped OCD-style bill dict and save it."""
    chamber = parse_psuedo_id(data['from_organization'])['classification']
    bill = Bill(data['legislative_session'], chamber, data['identifier'],
                data['title'], subjects=data['subject'],
                type=data['classification'])
    abstracts = data['abstracts']
    if abstracts:
        # billy stores a single summary; use the first abstract
        bill['summary'] = abstracts[0]['abstract']
    bill.update(**data['extras'])

    for action in data['actions']:
        actor = parse_psuedo_id(action['organization_id'])['classification']
        bill.add_action(actor, action['description'],
                        parse_date(action['date']),
                        type=_action_categories(action['classification']))
        # TODO: related entities

    for source in data['sources']:
        bill.add_source(source['url'])

    for sponsor in data['sponsorships']:
        bill.add_sponsor(sponsor['classification'], sponsor['name'])

    for version in data['versions']:
        for link in version['links']:
            bill.add_version(version['note'], link['url'],
                             mimetype=link['media_type'],
                             date=parse_date(version['date']))

    for doc in data['documents']:
        for link in doc['links']:
            bill.add_document(doc['note'], link['url'],
                              mimetype=link['media_type'],
                              date=parse_date(doc['date']))

    for title in data['other_titles']:
        # fix: entries are {'title': ...} dicts — pass the string, not the dict
        bill.add_title(title['title'])

    # TODO: related bills
    # for related in data['related_bills']:

    self.save_bill(bill)
def scrape(self, session, chambers):
    """Scrape DC Council legislation for a session from the LIMS API.

    Pages through the public advanced-search endpoint, then fetches each
    bill's detail record and extracts sponsors, titles, mayor/congress
    review actions, committee actions, documents and votes.
    """
    # get member id matching for vote parsing
    member_ids = self.get_member_ids()[session]
    per_page = 10  # seems like it gives me 10 no matter what.
    start_record = 0

    headers = {"Content-Type": "application/json"}
    url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicAdvancedSearch"
    bill_url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicData"
    params = {
        "request": {
            "sEcho": 2,
            "iColumns": 4,
            "sColumns": "",
            "iDisplayStart": 0,
            "iDisplayLength": per_page,
            "mDataProp_0": "ShortTitle",
            "mDataProp_1": "Title",
            "mDataProp_2": "LegislationCategories",
            "mDataProp_3": "Modified",
            "iSortCol_0": 0,
            "sSortDir_0": "asc",
            "iSortingCols": 0,
            "bSortable_0": "true",
            "bSortable_1": "true",
            "bSortable_2": "true",
            "bSortable_3": "true"
        },
        "criteria": {
            "Keyword": "",
            "Category": "",
            "SubCategoryId": "",
            "RequestOf": "",
            "CouncilPeriod": str(session),
            "Introducer": "",
            "CoSponsor": "",
            "CommitteeReferral": "",
            "CommitteeReferralComments": "",
            "StartDate": "",
            "EndDate": "",
            "QueryLimit": 100,
            "FilterType": "",
            "Phases": "",
            "LegislationStatus": "0",
            "IncludeDocumentSearch": "false"
        }
    }
    param_json = json.dumps(params)
    response = self.post(url, headers=headers, data=param_json)
    # the response is a terrible string-of-nested-json-strings. Yuck.
    response = self.decode_json(response.json()["d"])
    data = response["aaData"]

    # NOTE(review): module-level global, presumably shared with
    # add_documents for version de-duplication — confirm
    global bill_versions

    while len(data) > 0:
        for bill in data:
            # sometimes they're in there more than once, so we'll keep track
            bill_versions = []

            bill_id = bill["Title"]
            if bill_id.startswith("AG"):
                # actually an agenda, skip
                continue
            bill_params = {"legislationId": bill_id}
            bill_info = self.post(bill_url, headers=headers,
                                  data=json.dumps(bill_params))
            bill_info = self.decode_json(bill_info.json()["d"])["data"]
            bill_source_url = "http://lims.dccouncil.us/Legislation/" + bill_id

            legislation_info = bill_info["Legislation"][0]
            title = legislation_info["ShortTitle"]

            if bill_id.startswith("R") or bill_id.startswith("CER"):
                bill_type = "resolution"
            else:
                bill_type = "bill"

            # dc has no chambers. calling it all upper
            bill = Bill(session, "upper", bill_id, title, type=bill_type)

            # sponsors and cosponsors
            introducers = legislation_info["Introducer"]
            try:
                # sometimes there are cosponsors, sometimes not.
                cosponsors = legislation_info["CoSponsor"]
            except KeyError:
                cosponsors = []
            for i in introducers:
                sponsor_name = i["Name"]
                # they messed up Phil Mendelson's name
                if sponsor_name == "Phil Pmendelson":
                    sponsor_name = "Phil Mendelson"
                bill.add_sponsor(name=sponsor_name, type="primary")
            for s in cosponsors:
                sponsor_name = s["Name"]
                if sponsor_name == "Phil Pmendelson":
                    sponsor_name = "Phil Mendelson"
                bill.add_sponsor(name=sponsor_name, type="cosponsor")

            # if it's become law, add the law number as an alternate title
            if "LawNumber" in legislation_info:
                law_num = legislation_info["LawNumber"]
                if law_num:
                    bill.add_title(law_num)

            # also sometimes it's got an act number
            if "ActNumber" in legislation_info:
                act_num = legislation_info["ActNumber"]
                if act_num:
                    bill.add_title(act_num)

            # sometimes AdditionalInformation has a previous bill name
            if "AdditionalInformation" in legislation_info:
                add_info = legislation_info["AdditionalInformation"]
                if "previously" in add_info.lower():
                    prev_title = add_info.lower().replace(
                        "previously", "").strip().replace(" ", "")
                    bill.add_title(prev_title.upper())
                elif add_info:
                    bill["additional_information"] = add_info

            if "WithDrawnDate" in legislation_info:
                withdrawn_date = self.date_format(
                    legislation_info["WithDrawnDate"])
                withdrawn_by = legislation_info["WithdrawnBy"][0]["Name"].strip()
                if withdrawn_by == "the Mayor":
                    bill.add_action("executive", "withdrawn",
                                    withdrawn_date, "bill:withdrawn")
                elif "committee" in withdrawn_by.lower():
                    bill.add_action("upper", "withdrawn", withdrawn_date,
                                    "bill:withdrawn",
                                    committees=withdrawn_by)
                else:
                    bill.add_action("upper", "withdrawn", withdrawn_date,
                                    "bill:withdrawn",
                                    legislators=withdrawn_by)

            # deal with actions involving the mayor
            mayor = bill_info["MayorReview"]
            if mayor != []:
                mayor = mayor[0]
                # in dc, mayor == governor because openstates schema
                if "TransmittedDate" in mayor:
                    transmitted_date = self.date_format(
                        mayor["TransmittedDate"])
                    bill.add_action("executive", "transmitted to mayor",
                                    transmitted_date,
                                    type="governor:received")
                if 'SignedDate' in mayor:
                    signed_date = self.date_format(mayor["SignedDate"])
                    bill.add_action("executive", "signed", signed_date,
                                    type="governor:signed")
                elif 'ReturnedDate' in mayor:
                    # if returned but not signed, it was vetoed
                    veto_date = self.date_format(mayor["ReturnedDate"])
                    bill.add_action("executive", "vetoed", veto_date,
                                    type="governor:vetoed")
                    if 'EnactedDate' in mayor:
                        # if it was returned and enacted but not signed,
                        # there was a veto override
                        override_date = self.date_format(
                            mayor["EnactedDate"])
                        bill.add_action("upper", "veto override",
                                        override_date,
                                        type="bill:veto_override:passed")
                if 'AttachmentPath' in mayor:
                    # documents relating to the mayor's review
                    self.add_documents(mayor["AttachmentPath"], bill)

            congress = bill_info["CongressReview"]
            if len(congress) > 0:
                congress = congress[0]
                if "TransmittedDate" in congress:
                    transmitted_date = self.date_format(
                        congress["TransmittedDate"])
                    bill.add_action("other",
                                    "Transmitted to Congress for review",
                                    transmitted_date)

            # deal with committee actions
            if "DateRead" in legislation_info:
                date = legislation_info["DateRead"]
            elif "IntroductionDate" in legislation_info:
                date = legislation_info["IntroductionDate"]
            else:
                self.logger.warning("Crap, we can't find anything that looks like an action date. Skipping")
                continue
            date = self.date_format(date)
            if "CommitteeReferral" in legislation_info:
                committees = []
                for committee in legislation_info["CommitteeReferral"]:
                    if committee["Name"].lower() == "retained by the council":
                        committees = []
                        break
                    else:
                        committees.append(committee["Name"])
                if committees != []:
                    bill.add_action("upper", "referred to committee", date,
                                    committees=committees,
                                    type="committee:referred")
            if "CommitteeReferralComments" in legislation_info:
                committees = []
                for committee in legislation_info["CommitteeReferralComments"]:
                    committees.append(committee["Name"])
                bill.add_action("upper", "comments from committee", date,
                                committees=committees, type="other")

            # deal with random docs floating around
            docs = bill_info["OtherDocuments"]
            for d in docs:
                if "AttachmentPath" in d:
                    self.add_documents(d["AttachmentPath"], bill)
                else:
                    self.logger.warning("Document path missing from 'Other Documents'")
            if "MemoLink" in legislation_info:
                self.add_documents(legislation_info["MemoLink"], bill)
            if "AttachmentPath" in legislation_info:
                self.add_documents(legislation_info["AttachmentPath"], bill)

            # full council votes
            votes = bill_info["VotingSummary"]
            for vote in votes:
                self.process_vote(vote, bill, member_ids)

            # deal with committee votes
            if "CommitteeMarkup" in bill_info:
                committee_info = bill_info["CommitteeMarkup"]
                for committee_action in committee_info:
                    self.process_committee_vote(committee_action, bill)
                    # fix: the old code tested "AttachmentPath" against the
                    # list itself, then referenced the stale `vote` loop
                    # variable and an undefined `is_version` name; attach
                    # each markup's own document instead
                    if "AttachmentPath" in committee_action:
                        self.add_documents(
                            committee_action["AttachmentPath"], bill)

            bill.add_source(bill_source_url)
            self.save_bill(bill)

        # get next page
        start_record += per_page
        params["request"]["iDisplayStart"] = start_record
        param_json = json.dumps(params)
        response = self.post(url, headers=headers, data=param_json)
        response = self.decode_json(response.json()["d"])
        data = response["aaData"]
def process_bill(self, data):
    """Convert a scraped OCD-style bill dict into a billy Bill and save it.

    NOTE(review): assumes `data` follows the Open Civic Data bill schema
    (identifier, actions, sponsorships, versions, ...) — confirm against
    the scraper output.
    """
    chamber = parse_psuedo_id(data['from_organization'])['classification']
    # billy has no 'legislature' chamber; map unicameral bodies to 'upper'
    if chamber == 'legislature':
        chamber = 'upper'
    bill = Bill(data['legislative_session'], chamber, data['identifier'],
                data['title'], subjects=data['subject'],
                type=data['classification'])
    if data['abstracts']:
        # billy stores a single summary; use the first abstract
        bill['summary'] = data['abstracts'][0]['abstract']
    bill.update(**data['extras'])

    for action in data['actions']:
        actor = parse_psuedo_id(
            action['organization_id'])['classification']
        legislators = []
        committees = []
        # split related entities by kind for billy's action kwargs
        for rel in action['related_entities']:
            if rel['entity_type'] == 'organization':
                committees.append(rel['name'])
            elif rel['entity_type'] == 'person':
                legislators.append(rel['name'])
        bill.add_action(actor,
                        action['description'],
                        parse_date(action['date']),
                        type=_action_categories(action['classification']),
                        committees=committees,
                        legislators=legislators,
                        **action.get('extras', {}))

    for source in data['sources']:
        bill.add_source(source['url'])

    for sponsor in data['sponsorships']:
        bill.add_sponsor(
            sponsor['classification'],
            sponsor['name'],
        )

    # versions and documents each carry one or more links per entry
    for version in data['versions']:
        for link in version['links']:
            bill.add_version(version['note'], link['url'],
                             mimetype=link['media_type'],
                             date=parse_date(version['date']),
                             **version.get('extras', {}))

    for doc in data['documents']:
        for link in doc['links']:
            bill.add_document(doc['note'], link['url'],
                              mimetype=link['media_type'],
                              date=parse_date(doc['date']),
                              **doc.get('extras', {}))

    for title in data['other_titles']:
        bill.add_title(title['title'])

    # companions reuse this bill's chamber since the source data only
    # carries identifier + session for related bills
    for related in data['related_bills']:
        bill.add_companion(related['identifier'],
                           related['legislative_session'],
                           chamber)

    bill['alternate_bill_ids'] = [
        oi['identifier'] for oi in data['other_identifiers']
    ]

    self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, short_title=None):
    """
    Scrapes documents, actions, vote counts and votes for
    bills from the 2009 session and above.
    """
    url = BILL_URL % (session, bill_id.replace(' ', ''))
    bill_page = self.get(url).text
    html = lxml.html.fromstring(bill_page)
    html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session)
    bill_tables = html.xpath('//table[contains(@class, "bill-table")]')
    title = bill_tables[1].text_content().strip()
    bill_type = get_bill_type(bill_id)
    bill = Bill(session, chamber, bill_id, title, type=bill_type)
    bill.add_source(url)
    bill['subjects'] = self._subjects[bill_id.replace(' ', '')]

    # the short title only becomes an alternate title when it differs
    if short_title and bill['title'].lower() != short_title.lower():
        bill.add_title(short_title)

    # documents
    doc_links = html.xpath('//div[contains(@class,"pf-content")]//a')
    for link in doc_links:
        name = link.text_content().strip()
        href = link.get('href')
        if 'Engrossment' in name or 'Bill Text' in name:
            bill.add_version(name, href, mimetype='application/pdf')
        else:
            bill.add_document(name, href)

    def _split(string):
        # split a sponsor string on comma / AND separators
        return re.split(r"\w+[,|AND]\s+", string)

    # sponsors range from a committee to one legislator to a group of legs
    sponsor_lists = bill_tables[0].text_content().split('by')
    if len(sponsor_lists) > 1:
        for sponsors in sponsor_lists[1:]:
            if 'COMMITTEE' in sponsors.upper():
                bill.add_sponsor('primary', sponsors.strip())
            else:
                for person in _split(sponsors):
                    person = person.strip()
                    if person != "":
                        bill.add_sponsor('primary', person)

    # `actor` is stateful across rows: actions update it as the bill moves
    actor = chamber
    last_date = None
    for row in bill_tables[2]:
        # lots of empty rows
        if len(row) == 1:
            continue
        _, date, action, _ = [x.text_content().strip() for x in row]
        # continuation rows leave the date cell blank; reuse the last one
        if date:
            last_date = date
        else:
            date = last_date
        date = datetime.datetime.strptime(date + '/' + session[0:4],
                                          "%m/%d/%Y")
        if action.startswith('House'):
            actor = 'lower'
        elif action.startswith('Senate'):
            actor = 'upper'

        # votes
        if 'AYES' in action or 'NAYS' in action:
            vote = self.parse_vote(actor, date, row[2])
            vote.add_source(url)
            bill.add_vote(vote)
        # some td's text is seperated by br elements
        if len(row[2]):
            action = "".join(row[2].itertext())
            action = action.replace(u'\xa0', ' ').strip()
        atype = get_action(actor, action)
        bill.add_action(actor, action, date, type=atype)
        # after voice vote/roll call and some actions the bill is sent
        # 'to House' or 'to Senate'
        if 'to House' in action:
            actor = 'lower'
        elif 'to Senate' in action:
            actor = 'upper'
    self.save_bill(bill)
def scrape(self, session, chambers):
    """Scrape DC Council legislation for a session from the LIMS API.

    Pages through the public advanced-search endpoint, then fetches each
    bill's detail record and extracts sponsors, titles, mayor/congress
    review actions, committee actions, documents and votes.
    """
    # get member id matching for vote parsing
    member_ids = self.get_member_ids()[session]
    per_page = 10  # seems like it gives me 10 no matter what.
    start_record = 0

    headers = {"Content-Type": "application/json"}
    url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicAdvancedSearch"
    bill_url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicData"
    params = {
        "request": {
            "sEcho": 2,
            "iColumns": 4,
            "sColumns": "",
            "iDisplayStart": 0,
            "iDisplayLength": per_page,
            "mDataProp_0": "ShortTitle",
            "mDataProp_1": "Title",
            "mDataProp_2": "LegislationCategories",
            "mDataProp_3": "Modified",
            "iSortCol_0": 0,
            "sSortDir_0": "asc",
            "iSortingCols": 0,
            "bSortable_0": "true",
            "bSortable_1": "true",
            "bSortable_2": "true",
            "bSortable_3": "true"
        },
        "criteria": {
            "Keyword": "",
            "Category": "",
            "SubCategoryId": "",
            "RequestOf": "",
            "CouncilPeriod": str(session),
            "Introducer": "",
            "CoSponsor": "",
            "CommitteeReferral": "",
            "CommitteeReferralComments": "",
            "StartDate": "",
            "EndDate": "",
            "QueryLimit": 100,
            "FilterType": "",
            "Phases": "",
            "LegislationStatus": "0",
            "IncludeDocumentSearch": "false"
        }
    }
    param_json = json.dumps(params)
    response = self.post(url, headers=headers, data=param_json)
    # the response is a terrible string-of-nested-json-strings. Yuck.
    response = self.decode_json(response.json()["d"])
    data = response["aaData"]

    # NOTE(review): module-level global, presumably shared with
    # add_documents for version de-duplication — confirm
    global bill_versions

    while len(data) > 0:
        for bill in data:
            # sometimes they're in there more than once, so we'll keep track
            bill_versions = []

            bill_id = bill["Title"]
            if bill_id.startswith("AG"):
                # actually an agenda, skip
                continue
            bill_params = {"legislationId": bill_id}
            bill_info = self.post(bill_url, headers=headers,
                                  data=json.dumps(bill_params))
            bill_info = self.decode_json(bill_info.json()["d"])["data"]
            bill_source_url = "http://lims.dccouncil.us/Legislation/" + bill_id

            legislation_info = bill_info["Legislation"][0]
            title = legislation_info["ShortTitle"]

            if bill_id.startswith("R") or bill_id.startswith("CER"):
                bill_type = "resolution"
            else:
                bill_type = "bill"

            # dc has no chambers. calling it all upper
            bill = Bill(session, "upper", bill_id, title, type=bill_type)

            # sponsors and cosponsors
            if "Introducer" in legislation_info:
                introducers = legislation_info["Introducer"]
                intro_date = self.date_format(
                    legislation_info["IntroductionDate"])
                bill.add_action("upper", "Introduced", intro_date,
                                type="bill:introduced")
            else:
                # sometimes there are introducers, sometimes not.
                # Set Introducers to empty array to avoid downstream
                # breakage, but log bills without introducers
                self.logger.warning("No Introducer: {0} {1}: {2}".format(
                    bill['chamber'], bill['session'], bill['bill_id']))
                introducers = []
            try:
                # sometimes there are cosponsors, sometimes not.
                cosponsors = legislation_info["CoSponsor"]
            except KeyError:
                cosponsors = []
            for i in introducers:
                sponsor_name = i["Name"]
                # they messed up Phil Mendelson's name
                if sponsor_name == "Phil Pmendelson":
                    sponsor_name = "Phil Mendelson"
                bill.add_sponsor(name=sponsor_name, type="primary")
            for s in cosponsors:
                sponsor_name = s["Name"]
                if sponsor_name == "Phil Pmendelson":
                    sponsor_name = "Phil Mendelson"
                bill.add_sponsor(name=sponsor_name, type="cosponsor")

            # if it's become law, add the law number as an alternate title
            if "LawNumber" in legislation_info:
                law_num = legislation_info["LawNumber"]
                if law_num:
                    bill.add_title(law_num)

            # also sometimes it's got an act number
            if "ActNumber" in legislation_info:
                act_num = legislation_info["ActNumber"]
                if act_num:
                    bill.add_title(act_num)

            # sometimes AdditionalInformation has a previous bill name
            if "AdditionalInformation" in legislation_info:
                add_info = legislation_info["AdditionalInformation"]
                if "previously" in add_info.lower():
                    prev_title = add_info.lower().replace(
                        "previously", "").strip().replace(" ", "")
                    bill.add_title(prev_title.upper())
                elif add_info:
                    bill["additional_information"] = add_info

            if "WithDrawnDate" in legislation_info:
                withdrawn_date = self.date_format(
                    legislation_info["WithDrawnDate"])
                withdrawn_by = legislation_info["WithdrawnBy"][0]["Name"].strip()
                if withdrawn_by == "the Mayor":
                    bill.add_action("executive", "withdrawn",
                                    withdrawn_date, "bill:withdrawn")
                elif "committee" in withdrawn_by.lower():
                    bill.add_action("upper", "withdrawn", withdrawn_date,
                                    "bill:withdrawn",
                                    committees=withdrawn_by)
                else:
                    bill.add_action("upper", "withdrawn", withdrawn_date,
                                    "bill:withdrawn",
                                    legislators=withdrawn_by)

            # deal with actions involving the mayor
            mayor = bill_info["MayorReview"]
            if mayor != []:
                mayor = mayor[0]
                # in dc, mayor == governor because openstates schema
                if "TransmittedDate" in mayor:
                    transmitted_date = self.date_format(
                        mayor["TransmittedDate"])
                    bill.add_action("executive", "transmitted to mayor",
                                    transmitted_date,
                                    type="governor:received")
                if 'SignedDate' in mayor:
                    signed_date = self.date_format(mayor["SignedDate"])
                    bill.add_action("executive", "signed", signed_date,
                                    type="governor:signed")
                elif 'ReturnedDate' in mayor:
                    # if returned but not signed, it was vetoed
                    veto_date = self.date_format(mayor["ReturnedDate"])
                    bill.add_action("executive", "vetoed", veto_date,
                                    type="governor:vetoed")
                    if 'EnactedDate' in mayor:
                        # if it was returned and enacted but not signed,
                        # there was a veto override
                        override_date = self.date_format(
                            mayor["EnactedDate"])
                        bill.add_action("upper", "veto override",
                                        override_date,
                                        type="bill:veto_override:passed")
                if 'AttachmentPath' in mayor:
                    # documents relating to the mayor's review
                    self.add_documents(mayor["AttachmentPath"], bill)

            congress = bill_info["CongressReview"]
            if len(congress) > 0:
                congress = congress[0]
                if "TransmittedDate" in congress:
                    transmitted_date = self.date_format(
                        congress["TransmittedDate"])
                    bill.add_action("other",
                                    "Transmitted to Congress for review",
                                    transmitted_date)

            # deal with committee actions
            if "DateRead" in legislation_info:
                date = legislation_info["DateRead"]
            elif "IntroductionDate" in legislation_info:
                date = legislation_info["IntroductionDate"]
            else:
                # fix: reassembled the warning string that had been broken
                # across a wrapped line
                self.logger.warning("Crap, we can't find anything that looks like an action date. Skipping")
                continue
            date = self.date_format(date)
            if "CommitteeReferral" in legislation_info:
                committees = []
                for committee in legislation_info["CommitteeReferral"]:
                    if committee["Name"].lower() == "retained by the council":
                        committees = []
                        break
                    else:
                        committees.append(committee["Name"])
                if committees != []:
                    bill.add_action("upper", "referred to committee", date,
                                    committees=committees,
                                    type="committee:referred")
            if "CommitteeReferralComments" in legislation_info:
                committees = []
                for committee in legislation_info["CommitteeReferralComments"]:
                    committees.append(committee["Name"])
                bill.add_action("upper", "comments from committee", date,
                                committees=committees, type="other")

            # deal with random docs floating around
            docs = bill_info["OtherDocuments"]
            for d in docs:
                if "AttachmentPath" in d:
                    self.add_documents(d["AttachmentPath"], bill)
                else:
                    self.logger.warning("Document path missing from 'Other Documents'")
            if "MemoLink" in legislation_info:
                self.add_documents(legislation_info["MemoLink"], bill)
            if "AttachmentPath" in legislation_info:
                self.add_documents(legislation_info["AttachmentPath"], bill)

            # full council votes
            votes = bill_info["VotingSummary"]
            for vote in votes:
                self.process_vote(vote, bill, member_ids)

            # deal with committee votes
            if "CommitteeMarkup" in bill_info:
                committee_info = bill_info["CommitteeMarkup"]
                for committee_action in committee_info:
                    self.process_committee_vote(committee_action, bill)
                    # fix: "AttachmentPath" was tested against the list
                    # itself, and the old call referenced the stale `vote`
                    # loop variable plus an undefined `is_version` name;
                    # attach each markup's own document instead
                    if "AttachmentPath" in committee_action:
                        self.add_documents(
                            committee_action["AttachmentPath"], bill)

            bill.add_source(bill_source_url)
            self.save_bill(bill)

        # get next page
        start_record += per_page
        params["request"]["iDisplayStart"] = start_record
        param_json = json.dumps(params)
        response = self.post(url, headers=headers, data=param_json)
        response = self.decode_json(response.json()["d"])
        data = response["aaData"]
def scrape(self, chamber, session):
    """Scrape KS bills for one chamber of a session from the ksapi feed."""
    # check for abiword
    if os.system('which abiword') != 0:
        raise ScrapeError('abiword is required for KS scraping')

    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
        content = json.loads(bill_request)['content']
        for bill_data in content:
            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            # classify by the first matching marker (order matters:
            # CR before R before B)
            for marker, kind in (('CR', 'concurrent resolution'),
                                 ('R', 'resolution'),
                                 ('B', 'bill')):
                if marker in bill_id:
                    btype = kind
                    break

            title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

            # main
            bill = Bill(session, chamber, bill_id, title, type=btype,
                        status=bill_data['STATUS'])
            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

            # long title only added when it isn't already the main title
            long_title = bill_data['LONGTITLE']
            if long_title and long_title != bill['title']:
                bill.add_title(long_title)

            # a lone sponsor is primary; any group are all cosponsors
            sponsor_names = bill_data['SPONSOR_NAMES']
            sponsor_type = 'primary' if len(sponsor_names) == 1 else 'cosponsor'
            for sponsor in sponsor_names:
                bill.add_sponsor(sponsor_type, sponsor)

            # history is backwards
            for event in reversed(bill_data['HISTORY']):
                actor = 'upper' if event['chamber'] == 'Senate' else 'lower'
                date = datetime.datetime.strptime(
                    event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                # append committee names if present
                action = event['status']
                if 'committee_names' in event:
                    action = (action + ' ' +
                              ' and '.join(event['committee_names']))
                code = event['action_code']
                if code in ksapi.action_codes:
                    atype = ksapi.action_codes[code]
                else:
                    self.warning('unknown action code on %s: %s %s' %
                                 (bill_id, code, event['status']))
                    atype = 'other'
                bill.add_action(actor, action, date, type=atype)

            # bill text lives on the HTML site; tolerate fetch failures
            try:
                self.scrape_html(bill)
            except scrapelib.HTTPError as e:
                self.warning('unable to fetch HTML for bill {0}'.format(
                    bill['bill_id']))

            self.save_bill(bill)
def scrape(self, chamber, session):
    """Scrape KS bills for one chamber of a session from the ksapi feed."""
    # check for abiword
    if os.system('which abiword') != 0:
        raise ScrapeError('abiword is required for KS scraping')

    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']
        for bill_data in bills:
            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            # NOTE(review): btype is left unbound if the id contains none of
            # CR/R/B — presumably ids always match one; confirm upstream
            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'

            title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

            # main
            bill = Bill(session, chamber, bill_id, title,
                        type=btype, status=bill_data['STATUS'])
            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())

            # long title only added when it isn't already the main title
            if (bill_data['LONGTITLE'] and
                    bill_data['LONGTITLE'] != bill['title']):
                bill.add_title(bill_data['LONGTITLE'])

            # a lone sponsor is primary; any group are all cosponsors
            for sponsor in bill_data['SPONSOR_NAMES']:
                stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                         else 'cosponsor')
                bill.add_sponsor(stype, sponsor)

            # history is backwards
            for event in reversed(bill_data['HISTORY']):
                actor = ('upper' if event['chamber'] == 'Senate'
                         else 'lower')
                date = datetime.datetime.strptime(
                    event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']
                if event['action_code'] not in ksapi.action_codes:
                    self.warning('unknown action code on %s: %s %s' %
                                 (bill_id, event['action_code'],
                                  event['status']))
                    atype = 'other'
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(actor, action, date, type=atype)

            # bill text lives on the HTML site; tolerate fetch failures
            try:
                self.scrape_html(bill)
            except scrapelib.HTTPError as e:
                self.warning('unable to fetch HTML for bill {0}'.format(
                    bill['bill_id']))

            self.save_bill(bill)
def scrape_bill(self, chamber, session, bill_id, short_title=None):
    """
    Scrapes documents, actions, vote counts and votes for
    bills from the 2009 session and above.
    """
    # BILL_URL expects the bill id without internal spaces
    url = BILL_URL % (session, bill_id.replace(' ', ''))
    with self.urlopen(url) as bill_page:
        html = lxml.html.fromstring(bill_page)
        html.make_links_absolute(
            'http://legislature.idaho.gov/legislation/%s/' % session)
        # the page layout puts the bill data tables in the second td
        bill_tables = html.xpath('./body/table/tr/td[2]')[0].xpath(
            './/table')
        # second table holds the official (long) title
        title = bill_tables[1].text_content().strip()
        bill_type = get_bill_type(bill_id)
        bill = Bill(session, chamber, bill_id, title, type=bill_type)
        bill.add_source(url)
        bill['subjects'] = self._subjects[bill_id.replace(' ', '')]
        # record the short title as an alternate title when it differs
        if short_title and bill['title'].lower() != short_title.lower():
            bill.add_title(short_title)
        # documents
        doc_links = html.xpath('//span/a')
        for link in doc_links:
            name = link.text_content().strip()
            href = link.get('href')
            # engrossments and bill text are versions; everything else
            # (fiscal notes, statements of purpose, ...) is a document
            if 'Engrossment' in name or 'Bill Text' in name:
                bill.add_version(name, href)
            else:
                bill.add_document(name, href)
        # sponsors range from a committee to one legislator to a group of legs
        sponsor_lists = bill_tables[0].text_content().split('by')
        if len(sponsor_lists) > 1:
            for sponsors in sponsor_lists[1:]:
                for person in sponsors.split(','):
                    bill.add_sponsor('primary', person)
        # third table is the action history; actor starts as the origin
        # chamber and is updated as actions move the bill around
        actor = chamber
        last_date = None
        for row in bill_tables[2]:
            # lots of empty rows
            if len(row) == 1:
                continue
            _, date, action, _ = [x.text_content().strip() for x in row]
            # the date column is only filled on the first action of a day;
            # carry the last seen date forward for the blank rows
            if date:
                last_date = date
            else:
                date = last_date
            # dates are month/day only; the session string supplies the year
            date = datetime.datetime.strptime(date + '/' + session[0:4],
                                              "%m/%d/%Y")
            if action.startswith('House'):
                actor = 'lower'
            elif action.startswith('Senate'):
                actor = 'upper'
            # votes
            if 'AYES' in action or 'NAYS' in action:
                vote = self.parse_vote(actor, date, row[2])
                vote.add_source(url)
                bill.add_vote(vote)
            # some td's text is seperated by br elements
            if len(row[2]):
                action = "".join(row[2].itertext())
            action = action.replace(u'\xa0', ' ').strip()
            atype = get_action(actor, action)
            bill.add_action(actor, action, date, type=atype)
            # after voice vote/roll call and some actions the bill is sent
            # 'to House' or 'to Senate'
            if 'to House' in action:
                actor = 'lower'
            elif 'to Senate' in action:
                actor = 'upper'
        self.save_bill(bill)
def scrape_current(self, chamber, term):
    """Scrape current-term Kansas bills for one chamber from the ksapi feed.

    :param chamber: 'upper' or 'lower'
    :param term: term identifier passed through to Bill
    """
    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    with self.urlopen(ksapi.url + 'bill_status/') as bill_request:
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json['content']
        for bill_data in bills:
            bill_id = bill_data['BILLNO']

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if 'CR' in bill_id:
                btype = 'concurrent resolution'
            elif 'R' in bill_id:
                btype = 'resolution'
            elif 'B' in bill_id:
                btype = 'bill'
            else:
                # previously btype was left unbound here, raising a
                # NameError on any unexpected bill number format
                self.warning('unrecognized bill type for %s' % bill_id)
                btype = 'bill'

            # fall back to the long title when the short one is empty,
            # matching the other KS scrape variants
            title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

            # main
            bill = Bill(term, chamber, bill_id, title,
                        type=btype, status=bill_data['STATUS'])
            bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())
            # only record the long title when it adds information
            if (bill_data['LONGTITLE'] and
                    bill_data['LONGTITLE'] != bill['title']):
                bill.add_title(bill_data['LONGTITLE'])

            for sponsor in bill_data['SPONSOR_NAMES']:
                stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                         else 'cosponsor')
                bill.add_sponsor(stype, sponsor)

            # history is backwards
            for event in reversed(bill_data['HISTORY']):
                actor = ('upper' if event['chamber'] == 'Senate'
                         else 'lower')
                date = datetime.datetime.strptime(
                    event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
                # append committee names if present
                if 'committee_names' in event:
                    action = (event['status'] + ' ' +
                              ' and '.join(event['committee_names']))
                else:
                    action = event['status']
                if event['action_code'] not in ksapi.action_codes:
                    self.warning(
                        'unknown action code on %s: %s %s' %
                        (bill_id, event['action_code'], event['status']))
                    atype = 'other'
                else:
                    atype = ksapi.action_codes[event['action_code']]
                bill.add_action(actor, action, date, type=atype)

            # guard the HTML fetch like the sibling scrape variants do,
            # so a single 404 does not abort the whole run
            try:
                self.scrape_html(bill)
            except scrapelib.HTTPError:
                self.warning('unable to fetch HTML for bill {0}'.format(
                    bill['bill_id']))
            self.save_bill(bill)
def scrape(self, chamber, session):
    """Scrape Kansas bills for one chamber from the ksapi bill_status feed.

    Requires the ``abiword`` binary (used downstream when converting bill
    documents); fails fast with ScrapeError if it is missing.

    :param chamber: 'upper' or 'lower'
    :param session: legislative session identifier passed through to Bill
    """
    # check for abiword
    if os.system("which abiword") != 0:
        raise ScrapeError("abiword is required for KS scraping")

    chamber_name = "Senate" if chamber == "upper" else "House"
    chamber_letter = chamber_name[0]
    # perhaps we should save this data so we can make one request for both?
    with self.urlopen(ksapi.url + "bill_status/") as bill_request:
        bill_request_json = json.loads(bill_request)
        bills = bill_request_json["content"]
        for bill_data in bills:
            bill_id = bill_data["BILLNO"]

            # filter other chambers
            if not bill_id.startswith(chamber_letter):
                continue

            if "CR" in bill_id:
                btype = "concurrent resolution"
            elif "R" in bill_id:
                btype = "resolution"
            elif "B" in bill_id:
                btype = "bill"
            else:
                # previously btype was left unbound here, raising a
                # NameError on any unexpected bill number format
                self.warning("unrecognized bill type for %s" % bill_id)
                btype = "bill"

            # fall back to the long title when the short one is empty
            title = bill_data["SHORTTITLE"] or bill_data["LONGTITLE"]

            # main
            bill = Bill(session, chamber, bill_id, title,
                        type=btype, status=bill_data["STATUS"])
            bill.add_source(ksapi.url + "bill_status/" + bill_id.lower())
            # only record the long title when it adds information
            if bill_data["LONGTITLE"] and bill_data["LONGTITLE"] != bill["title"]:
                bill.add_title(bill_data["LONGTITLE"])

            for sponsor in bill_data["SPONSOR_NAMES"]:
                stype = "primary" if len(bill_data["SPONSOR_NAMES"]) == 1 else "cosponsor"
                bill.add_sponsor(stype, sponsor)

            # history is backwards
            for event in reversed(bill_data["HISTORY"]):
                actor = "upper" if event["chamber"] == "Senate" else "lower"
                date = datetime.datetime.strptime(
                    event["occurred_datetime"], "%Y-%m-%dT%H:%M:%S")
                # append committee names if present
                if "committee_names" in event:
                    action = event["status"] + " " + " and ".join(event["committee_names"])
                else:
                    action = event["status"]
                if event["action_code"] not in ksapi.action_codes:
                    self.warning(
                        "unknown action code on %s: %s %s"
                        % (bill_id, event["action_code"], event["status"])
                    )
                    atype = "other"
                else:
                    atype = ksapi.action_codes[event["action_code"]]
                bill.add_action(actor, action, date, type=atype)

            # the HTML page sometimes 404s; keep the API data regardless
            try:
                self.scrape_html(bill)
            except scrapelib.HTTPError:
                self.warning("unable to fetch HTML for bill {0}".format(bill["bill_id"]))
            self.save_bill(bill)
def _scrape_data(self, chamber, session, jsons):
    """Parse a pre-fetched ksapi bill_status JSON payload and save bills.

    :param chamber: 'upper' or 'lower'
    :param session: legislative session identifier passed through to Bill
    :param jsons: raw JSON string from the ksapi bill_status endpoint
    """
    chamber_name = 'Senate' if chamber == 'upper' else 'House'
    chamber_letter = chamber_name[0]
    bill_request_json = json.loads(jsons)
    bills = bill_request_json['content']
    # hoist the accessor call: the original re-called
    # get_filter_bill_id() up to four times per bill
    filter_bill_id = self.get_filter_bill_id()
    for bill_data in bills:
        bill_id = bill_data['BILLNO']

        # optional single-bill filter; False means "no filtering"
        if filter_bill_id is not False:
            if bill_id != filter_bill_id:
                _log.debug("Skipping bill_id %s != %s" % (
                    bill_id, filter_bill_id))
                continue
            _log.debug("Matched bill_id %s == %s" % (
                bill_id, filter_bill_id))
        else:
            _log.debug("no check bill_id %s and %s" % (
                bill_id, filter_bill_id))

        # filter other chambers
        if not bill_id.startswith(chamber_letter):
            continue

        if 'CR' in bill_id:
            btype = 'concurrent resolution'
        elif 'R' in bill_id:
            btype = 'resolution'
        elif 'B' in bill_id:
            btype = 'bill'
        else:
            # previously btype was left unbound here, raising a
            # NameError on any unexpected bill number format
            self.warning('unrecognized bill type for %s' % bill_id)
            btype = 'bill'

        # fall back to the long title when the short one is empty
        title = bill_data['SHORTTITLE'] or bill_data['LONGTITLE']

        # main
        bill = Bill(session, chamber, bill_id, title,
                    type=btype, status=bill_data['STATUS'])
        bill.add_source(ksapi.url + 'bill_status/' + bill_id.lower())
        # only record the long title when it adds information
        if (bill_data['LONGTITLE'] and
                bill_data['LONGTITLE'] != bill['title']):
            bill.add_title(bill_data['LONGTITLE'])

        for sponsor in bill_data['SPONSOR_NAMES']:
            stype = ('primary' if len(bill_data['SPONSOR_NAMES']) == 1
                     else 'cosponsor')
            bill.add_sponsor(stype, sponsor)

        # history is backwards
        for event in reversed(bill_data['HISTORY']):
            actor = ('upper' if event['chamber'] == 'Senate'
                     else 'lower')
            date = datetime.datetime.strptime(
                event['occurred_datetime'], "%Y-%m-%dT%H:%M:%S")
            # append committee names if present
            if 'committee_names' in event:
                action = (event['status'] + ' ' +
                          ' and '.join(event['committee_names']))
            else:
                action = event['status']
            if event['action_code'] not in ksapi.action_codes:
                self.warning('unknown action code on %s: %s %s' %
                             (bill_id, event['action_code'],
                              event['status']))
                atype = 'other'
            else:
                atype = ksapi.action_codes[event['action_code']]
            bill.add_action(actor, action, date, type=atype)

        # the HTML page sometimes 404s; keep the API data regardless
        try:
            self.scrape_html(bill)
        except scrapelib.HTTPError as e:
            self.debug(e)
            self.debug(bill)
            self.warning('unable to fetch HTML for bill {0}'.format(
                bill['bill_id']))
        self.save_bill(bill)