def scrape(self):
    """Yield one Bill per titled record found in the legislation search.

    Sets ``self.session`` to ``'2011'``, walks every page returned by
    ``self.searchLegislation()``, and for each summary row parsed from a
    page builds a ``Bill`` carrying its source URL, related files,
    sponsors, subjects and PDF attachments.

    Yields:
        Bill: one per search result whose stripped title is non-empty.
    """
    self.session = '2011'

    for page in self.searchLegislation():
        for summary in self.parseSearchResults(page):
            title = summary['Title'].strip()
            # Rows without a title are not turned into bills.
            if not title:
                continue

            bill = Bill(
                name=summary['Record #'],
                session=self.session,
                title=title,
                type=[summary['Type'].lower()],
                organization=self.jurisdiction.name,
            )
            bill.add_source(summary['URL'])

            details = self.expandLegislationSummary(summary)

            for related in details.get('Related files', []):
                bill.add_related_bill(
                    name=related,
                    session=self.session,
                    relation='other-session',
                    chamber=None,
                )

            # The first listed sponsor is the primary one; all others are
            # regular sponsors.
            for position, sponsor in enumerate(details.get('Sponsors', [])):
                is_primary = position == 0
                label = "Primary" if is_primary else "Regular"
                bill.add_sponsor(sponsor, label, 'person', is_primary)

            for topic in details.get(u'Topics', []):
                bill.add_subject(topic)

            for document in details.get(u'Attachments', []):
                bill.add_version_link(
                    'PDF', document['url'], mimetype="application/pdf")

            yield bill
def scrape(self):
    """Generate Bill objects from the paginated legislation search.

    ``self.session`` is fixed to ``'2011'``. Every search-result row with
    a non-blank title becomes a ``Bill``; the row is then expanded via
    ``self.expandLegislationSummary`` to attach related bills, sponsors,
    subjects and PDF version links before the bill is yielded.
    """
    self.session = '2011'

    def sponsor_role(index):
        # Only the sponsor listed first counts as primary.
        if index == 0:
            return "Primary", True
        return "Regular", False

    for results_page in self.searchLegislation():
        for record in self.parseSearchResults(results_page):
            bill_title = record['Title'].strip()
            if bill_title == "":
                # Nothing useful can be built from an untitled record.
                continue

            bill = Bill(name=record['Record #'],
                        session=self.session,
                        title=bill_title,
                        type=[record['Type'].lower()],
                        organization=self.jurisdiction.name)
            bill.add_source(record['URL'])

            expanded = self.expandLegislationSummary(record)

            related_files = expanded.get('Related files', [])
            for related_name in related_files:
                bill.add_related_bill(name=related_name,
                                      session=self.session,
                                      relation='other-session',
                                      chamber=None)

            sponsors = expanded.get('Sponsors', [])
            for index, sponsor_name in enumerate(sponsors):
                sponsorship_type, primary = sponsor_role(index)
                bill.add_sponsor(sponsor_name, sponsorship_type,
                                 'person', primary)

            subjects = expanded.get(u'Topics', [])
            for subject_name in subjects:
                bill.add_subject(subject_name)

            attachments = expanded.get(u'Attachments', [])
            for pdf in attachments:
                bill.add_version_link('PDF', pdf['url'],
                                      mimetype="application/pdf")

            yield bill
def old_scrape(self, session=None):
    """Scrape bills from the Ohio legislature's Excel status reports.

    Loads the status-report page, finds the table that follows the heading
    containing ``session``, downloads every linked Excel report and turns
    each spreadsheet row into a ``Bill`` with sponsors and dated actions.

    Args:
        session: Session label to look for on the status-report page; when
            falsy, ``self.latest_session()`` is used.

    Yields:
        Whatever ``self.scrape_votes_old`` yields for a bill, followed by
        the ``Bill`` itself.

    NOTE(review): ``add_sponsor``/``add_action`` are called with positional
    (type, name) / (actor, action, date) arguments -- confirm this matches
    the ``Bill`` class in use.
    """
    status_report_url = "http://www.legislature.ohio.gov/legislation/status-reports"

    # Column headers that mark a bill's introduction.
    intro_headers = ('House Intro. Date', 'Senate Intro. Date')
    # Column header -> action classification; other headers map to 'other'.
    action_types = {
        'House Intro. Date': ['bill:introduced'],
        'Senate Intro. Date': ['bill:introduced'],
        '3rd Consideration': ['bill:reading:3', 'bill:passed'],
        'Sent to Gov.': ['governor:received'],
        'Signed By Governor': ['governor:signed'],
    }

    if not session:
        session = self.latest_session()
        self.info('no session, using %s', session)

    # ssl verification off due Ohio not correctly implementing SSL
    doc = self.get(status_report_url, verify=False).text
    doc = lxml.html.fromstring(doc)
    doc.make_links_absolute(status_report_url)

    xpath = "//div[contains(text(),'{}')]/following-sibling::table"
    status_table = doc.xpath(xpath.format(session))[0]
    status_links = status_table.xpath(".//a[contains(text(),'Excel')]/@href")

    for url in status_links:
        try:
            fname, resp = self.urlretrieve(url)
        except scrapelib.HTTPError as report:
            self.logger.warning("Missing report {}".format(report))
            continue

        # Once the workbook is open the tempfile is no longer needed;
        # remove it even when the workbook cannot be parsed.
        try:
            sh = xlrd.open_workbook(fname).sheet_by_index(0)
        finally:
            os.remove(fname)

        for rownum in range(1, sh.nrows):
            bill_id = sh.cell(rownum, 0).value

            bill_type = "resolution" if "R" in bill_id else "bill"
            chamber = "lower" if "H" in bill_id else "upper"
            bill_title = str(sh.cell(rownum, 3).value)

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=chamber,
                title=bill_title,
                classification=bill_type
            )
            bill.add_source(url)
            bill.add_sponsor('primary', str(sh.cell(rownum, 1).value))

            # add cosponsor
            if sh.cell(rownum, 2).value:
                bill.add_sponsor('cosponsor', str(sh.cell(rownum, 2).value))

            # The actor carries over from the previous column when a header
            # names neither a chamber nor the governor.
            actor = ""

            # Actions start column after bill title
            for colnum in range(4, sh.ncols - 1):
                action = str(sh.cell(0, colnum).value)
                date = sh.cell(rownum, colnum).value

                # Splitting once also guards against whitespace-only
                # headers, for which indexing the split result would fail.
                words = action.split()
                if words:
                    if words[0] == 'House':
                        actor = "lower"
                    elif words[0] == 'Senate':
                        actor = "upper"
                    elif words[-1] == 'Governor' or 'Gov.' in (words[0], words[-1]):
                        actor = "executive"

                # Copy so that every action gets its own list object.
                atype = list(action_types.get(action, ['other']))
                if action in intro_headers:
                    action = action.replace('Intro. Date', 'Introduced')

                # Only float cells are treated as Excel dates; any other
                # cell value produces no action for this column.
                if isinstance(date, float):
                    when = datetime.datetime(*xlrd.xldate_as_tuple(date, 0))
                    when = self._tz.localize(when)
                    date = "{:%Y-%m-%d}".format(when)
                    bill.add_action(actor, action, date, type=atype)

            # Insert an underscore before the first digit of the bill id.
            # Reset per row so an id without digits can neither reuse the
            # previous row's value nor hit an unbound name.
            underscore_bill = None
            for idx, char in enumerate(bill_id):
                if char.isdecimal():
                    underscore_bill = bill_id[:idx] + "_" + bill_id[idx:]
                    break

            if underscore_bill is None:
                self.logger.warning(
                    "No digits in bill id {!r}; skipping votes and versions".format(bill_id))
            else:
                yield from self.scrape_votes_old(bill, underscore_bill, session)
                self.scrape_versions_old(bill, underscore_bill, session)

            yield bill
def scrape(self, session=None):
    """Scrape DC Council legislation for one council period from LIMS.

    Pages through the ``GetPublicAdvancedSearch`` endpoint, requests the
    full record of every result from ``GetPublicData`` and builds a
    ``Bill`` with sponsors, alternate titles, actions, documents and votes.
    Results whose id starts with ``AG`` (agendas) are skipped, as are
    records with neither a ``DateRead`` nor an ``IntroductionDate``.

    Args:
        session: Council period to scrape; when falsy,
            ``self.latest_session()`` is used.

    Side effects:
        Resets the module-level ``bill_versions`` list for every search
        result and hands each finished bill to ``self.save_bill``.

    NOTE(review): this method does not yield the bills it builds -- confirm
    the base scraper provides ``save_bill``.
    """
    global bill_versions

    if not session:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    # get member id matching for vote parsing
    member_ids = self.get_member_ids()[session]
    per_page = 10  # seems like it gives me 10 no matter what.
    start_record = 0

    headers = {"Content-Type": "application/json"}
    url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicAdvancedSearch"
    bill_url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicData"
    params = {
        "request": {
            "sEcho": 2,
            "iColumns": 4,
            "sColumns": "",
            "iDisplayStart": 0,
            "iDisplayLength": per_page,
            "mDataProp_0": "ShortTitle",
            "mDataProp_1": "Title",
            "mDataProp_2": "LegislationCategories",
            "mDataProp_3": "Modified",
            "iSortCol_0": 0,
            "sSortDir_0": "asc",
            "iSortingCols": 0,
            "bSortable_0": "true",
            "bSortable_1": "true",
            "bSortable_2": "true",
            "bSortable_3": "true",
        },
        "criteria": {
            "Keyword": "",
            "Category": "",
            "SubCategoryId": "",
            "RequestOf": "",
            "CouncilPeriod": str(session),
            "Introducer": "",
            "CoSponsor": "",
            "CommitteeReferral": "",
            "CommitteeReferralComments": "",
            "StartDate": "",
            "EndDate": "",
            "QueryLimit": 100,
            "FilterType": "",
            "Phases": "",
            "LegislationStatus": "0",
            "IncludeDocumentSearch": "false",
        },
    }

    def fetch_page(start):
        # Return the search rows beginning at record offset `start`.
        params["request"]["iDisplayStart"] = start
        response = self.post(url, headers=headers, data=json.dumps(params))
        # the response is a terrible string-of-nested-json-strings. Yuck.
        return decode_json(response.json()["d"])["aaData"]

    def clean_name(person):
        # they messed up Phil Mendelson's name
        name = person["Name"]
        return "Phil Mendelson" if name == "Phil Pmendelson" else name

    data = fetch_page(start_record)

    while data:
        for summary in data:
            # sometimes they're in there more than once, so we'll keep track
            bill_versions = []

            bill_id = summary["Title"]
            if bill_id.startswith("AG"):  # actually an agenda, skip
                continue

            bill_params = {"legislationId": bill_id}
            bill_info = self.post(bill_url, headers=headers,
                                  data=json.dumps(bill_params))
            bill_info = decode_json(bill_info.json()["d"])["data"]
            bill_source_url = "http://lims.dccouncil.us/Legislation/" + bill_id

            legislation_info = bill_info["Legislation"][0]
            title = legislation_info["ShortTitle"]

            if bill_id.startswith(("R", "CER")):
                bill_type = "resolution"
            else:
                bill_type = "bill"

            # dc has no chambers. calling it all upper
            bill = Bill(bill_id, legislative_session=session, title=title,
                        classification=bill_type)

            # sponsors and cosponsors
            if "Introducer" in legislation_info:
                introducers = legislation_info["Introducer"]
                intro_date = self.date_format(
                    legislation_info["IntroductionDate"])
                bill.add_action("Introduced", intro_date, chamber="upper",
                                classification="introduction")
            else:
                # sometimes there are introducers, sometimes not.
                # Keep going with an empty list, but log the bill. The
                # local session/id are logged because the Bill object is
                # built without a chamber and is not indexed by key here.
                self.logger.warning("No Introducer: %s: %s", session, bill_id)
                introducers = []

            # sometimes there are cosponsors, sometimes not.
            cosponsors = legislation_info.get("CoSponsor", [])

            for introducer in introducers:
                # The classification is a label, not the co-sponsor list.
                bill.add_sponsorship(clean_name(introducer),
                                     classification="primary",
                                     entity_type='person',
                                     primary=True)

            for cosponsor in cosponsors:
                # Same call as for introducers so both sponsor kinds are
                # recorded through one API.
                bill.add_sponsorship(clean_name(cosponsor),
                                     classification="cosponsor",
                                     entity_type='person',
                                     primary=False)

            # if it's become law, add the law number as an alternate title
            law_num = legislation_info.get("LawNumber")
            if law_num:
                bill.add_title(law_num)

            # also sometimes it's got an act number
            act_num = legislation_info.get("ActNumber")
            if act_num:
                bill.add_title(act_num)

            # sometimes AdditionalInformation has a previous bill name
            if "AdditionalInformation" in legislation_info:
                add_info = legislation_info["AdditionalInformation"]
                if "previously" in add_info.lower():
                    prev_title = add_info.lower().replace(
                        "previously", "").strip().replace(" ", "")
                    bill.add_title(prev_title.upper())
                elif add_info:
                    # NOTE(review): assumes Bill supports item assignment
                    # -- confirm against the Bill class in use.
                    bill["additional_information"] = add_info

            # NOTE(review): the `committees=` / `legislators=` keywords
            # passed to add_action below and in the committee section are
            # kept as found -- confirm that Bill.add_action accepts them.
            if "WithDrawnDate" in legislation_info:
                withdrawn_date = self.date_format(
                    legislation_info["WithDrawnDate"])
                withdrawn_by = legislation_info["WithdrawnBy"][0]["Name"].strip()
                if withdrawn_by == "the Mayor":
                    bill.add_action("withdrawn", withdrawn_date,
                                    chamber="executive",
                                    classification="withdrawal")
                elif "committee" in withdrawn_by.lower():
                    bill.add_action("withdrawn", withdrawn_date,
                                    chamber="upper",
                                    classification="withdrawal",
                                    committees=withdrawn_by)
                else:
                    bill.add_action("withdrawn", withdrawn_date,
                                    chamber="upper",
                                    classification="withdrawal",
                                    legislators=withdrawn_by)

            # deal with actions involving the mayor
            mayor = bill_info["MayorReview"]
            if mayor:
                mayor = mayor[0]

                # in dc, mayor == governor because openstates schema
                if "TransmittedDate" in mayor:
                    transmitted_date = self.date_format(
                        mayor["TransmittedDate"])
                    bill.add_action("transmitted to mayor", transmitted_date,
                                    chamber="executive",
                                    classification="executive-receipt")

                if 'SignedDate' in mayor:
                    signed_date = self.date_format(mayor["SignedDate"])
                    bill.add_action("signed", signed_date,
                                    chamber="executive",
                                    classification="executive-signature")
                elif 'ReturnedDate' in mayor:
                    # if returned but not signed, it was vetoed
                    veto_date = self.date_format(mayor["ReturnedDate"])
                    bill.add_action("vetoed", veto_date,
                                    chamber="executive",
                                    classification="executive-veto")

                    if 'EnactedDate' in mayor:
                        # if it was returned and enacted but not signed,
                        # there was a veto override
                        override_date = self.date_format(mayor["EnactedDate"])
                        bill.add_action("veto override", override_date,
                                        chamber="upper",
                                        classification="veto-override-passage")

                if 'AttachmentPath' in mayor:
                    # documents relating to the mayor's review
                    self.add_documents(mayor["AttachmentPath"], bill)

            congress = bill_info["CongressReview"]
            if congress:
                congress = congress[0]
                if "TransmittedDate" in congress:
                    transmitted_date = self.date_format(
                        congress["TransmittedDate"])
                    bill.add_action("Transmitted to Congress for review",
                                    transmitted_date, chamber="other")

            # deal with committee actions
            if "DateRead" in legislation_info:
                date = legislation_info["DateRead"]
            elif "IntroductionDate" in legislation_info:
                date = legislation_info["IntroductionDate"]
            else:
                self.logger.warning(
                    "Crap, we can't find anything that looks like an action date. Skipping")
                continue
            date = self.date_format(date)

            if "CommitteeReferral" in legislation_info:
                committees = [committee["Name"] for committee
                              in legislation_info["CommitteeReferral"]]
                # A bill retained by the council was not referred anywhere.
                if any(name.lower() == "retained by the council"
                       for name in committees):
                    committees = []
                if committees:
                    bill.add_action("referred to committee", date,
                                    chamber="upper",
                                    committees=committees,
                                    classification="referral-committee")

            if "CommitteeReferralComments" in legislation_info:
                committees = [committee["Name"] for committee
                              in legislation_info["CommitteeReferralComments"]]
                bill.add_action("comments from committee", date,
                                chamber="upper",
                                committees=committees,
                                classification="other")

            # deal with random docs floating around
            for other_doc in bill_info["OtherDocuments"]:
                if "AttachmentPath" in other_doc:
                    self.add_documents(other_doc["AttachmentPath"], bill)
                else:
                    self.logger.warning(
                        "Document path missing from 'Other Documents'")

            if "MemoLink" in legislation_info:
                self.add_documents(legislation_info["MemoLink"], bill)

            if "AttachmentPath" in legislation_info:
                self.add_documents(legislation_info["AttachmentPath"], bill)

            # full council votes
            for vote in bill_info["VotingSummary"]:
                self.process_vote(vote, bill, member_ids)

            # deal with committee votes
            for committee_action in bill_info.get("CommitteeMarkup", []):
                self.process_committee_vote(committee_action, bill)
                # Look for the attachment on the markup entry itself
                # (presumably where it lives -- confirm against the API).
                # Testing the containing list can never match, and the
                # full-council `vote` variable does not belong here.
                if "AttachmentPath" in committee_action:
                    self.add_documents(committee_action["AttachmentPath"], bill)

            bill.add_source(bill_source_url)
            self.save_bill(bill)

        # get next page
        start_record += per_page
        data = fetch_page(start_record)
def old_scrape(self, session=None):
    """Scrape bills from the Ohio legislature's Excel status reports.

    Loads the status-report page, finds the table that follows the heading
    containing ``session``, downloads every linked Excel report and turns
    each spreadsheet row into a ``Bill`` with sponsors and dated actions.

    Args:
        session: Session label to look for on the status-report page; when
            falsy, ``self.latest_session()`` is used.

    Yields:
        Whatever ``self.scrape_votes_old`` yields for a bill, followed by
        the ``Bill`` itself.

    NOTE(review): ``add_sponsor``/``add_action`` are called with positional
    (type, name) / (actor, action, date) arguments -- confirm this matches
    the ``Bill`` class in use.
    """
    status_report_url = "https://www.legislature.ohio.gov/legislation/status-reports"

    # Column headers that mark a bill's introduction.
    intro_headers = ('House Intro. Date', 'Senate Intro. Date')
    # Column header -> action classification; other headers map to 'other'.
    action_types = {
        'House Intro. Date': ['bill:introduced'],
        'Senate Intro. Date': ['bill:introduced'],
        '3rd Consideration': ['bill:reading:3', 'bill:passed'],
        'Sent to Gov.': ['governor:received'],
        'Signed By Governor': ['governor:signed'],
    }

    if not session:
        session = self.latest_session()
        self.info('no session, using %s', session)

    doc = self.get(status_report_url).text
    doc = lxml.html.fromstring(doc)
    doc.make_links_absolute(status_report_url)

    xpath = "//div[contains(text(),'{}')]/following-sibling::table"
    status_table = doc.xpath(xpath.format(session))[0]
    status_links = status_table.xpath(".//a[contains(text(),'Excel')]/@href")

    for url in status_links:
        try:
            fname, resp = self.urlretrieve(url)
        except scrapelib.HTTPError as report:
            self.logger.warning("Missing report {}".format(report))
            continue

        # Once the workbook is open the tempfile is no longer needed;
        # remove it even when the workbook cannot be parsed.
        try:
            sh = xlrd.open_workbook(fname).sheet_by_index(0)
        finally:
            os.remove(fname)

        for rownum in range(1, sh.nrows):
            bill_id = sh.cell(rownum, 0).value

            bill_type = "resolution" if "R" in bill_id else "bill"
            chamber = "lower" if "H" in bill_id else "upper"
            bill_title = str(sh.cell(rownum, 3).value)

            bill = Bill(
                bill_id,
                legislative_session=session,
                chamber=chamber,
                title=bill_title,
                classification=bill_type
            )
            bill.add_source(url)
            bill.add_sponsor('primary', str(sh.cell(rownum, 1).value))

            # add cosponsor
            if sh.cell(rownum, 2).value:
                bill.add_sponsor('cosponsor', str(sh.cell(rownum, 2).value))

            # The actor carries over from the previous column when a header
            # names neither a chamber nor the governor.
            actor = ""

            # Actions start column after bill title
            for colnum in range(4, sh.ncols - 1):
                action = str(sh.cell(0, colnum).value)
                date = sh.cell(rownum, colnum).value

                # Splitting once also guards against whitespace-only
                # headers, for which indexing the split result would fail.
                words = action.split()
                if words:
                    if words[0] == 'House':
                        actor = "lower"
                    elif words[0] == 'Senate':
                        actor = "upper"
                    elif words[-1] == 'Governor' or 'Gov.' in (words[0], words[-1]):
                        actor = "executive"

                # Copy so that every action gets its own list object.
                atype = list(action_types.get(action, ['other']))
                if action in intro_headers:
                    action = action.replace('Intro. Date', 'Introduced')

                # Only float cells are treated as Excel dates; any other
                # cell value produces no action for this column.
                if isinstance(date, float):
                    when = datetime.datetime(*xlrd.xldate_as_tuple(date, 0))
                    when = self._tz.localize(when)
                    date = "{:%Y-%m-%d}".format(when)
                    bill.add_action(actor, action, date, type=atype)

            # Insert an underscore before the first digit of the bill id.
            # Reset per row so an id without digits can neither reuse the
            # previous row's value nor hit an unbound name.
            underscore_bill = None
            for idx, char in enumerate(bill_id):
                if char.isdecimal():
                    underscore_bill = bill_id[:idx] + "_" + bill_id[idx:]
                    break

            if underscore_bill is None:
                self.logger.warning(
                    "No digits in bill id {!r}; skipping votes and versions".format(bill_id))
            else:
                yield from self.scrape_votes_old(bill, underscore_bill, session)
                self.scrape_versions_old(bill, underscore_bill, session)

            yield bill
def scrape(self, session=None):
    """Scrape DC Council legislation for one council period from LIMS.

    Pages through the ``GetPublicAdvancedSearch`` endpoint, requests the
    full record of every result from ``GetPublicData`` and builds a
    ``Bill`` with sponsors, alternate titles, actions, documents and votes.
    Results whose id starts with ``AG`` (agendas) are skipped, as are
    records with neither a ``DateRead`` nor an ``IntroductionDate``.

    Args:
        session: Council period to scrape; when falsy,
            ``self.latest_session()`` is used.

    Side effects:
        Resets the module-level ``bill_versions`` list for every search
        result and hands each finished bill to ``self.save_bill``.

    NOTE(review): this method does not yield the bills it builds -- confirm
    the base scraper provides ``save_bill``.
    """
    global bill_versions

    if not session:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    # get member id matching for vote parsing
    member_ids = self.get_member_ids()[session]
    per_page = 10  # seems like it gives me 10 no matter what.
    start_record = 0

    headers = {"Content-Type": "application/json"}
    url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicAdvancedSearch"
    bill_url = "http://lims.dccouncil.us/_layouts/15/uploader/AdminProxy.aspx/GetPublicData"
    params = {
        "request": {
            "sEcho": 2,
            "iColumns": 4,
            "sColumns": "",
            "iDisplayStart": 0,
            "iDisplayLength": per_page,
            "mDataProp_0": "ShortTitle",
            "mDataProp_1": "Title",
            "mDataProp_2": "LegislationCategories",
            "mDataProp_3": "Modified",
            "iSortCol_0": 0,
            "sSortDir_0": "asc",
            "iSortingCols": 0,
            "bSortable_0": "true",
            "bSortable_1": "true",
            "bSortable_2": "true",
            "bSortable_3": "true",
        },
        "criteria": {
            "Keyword": "",
            "Category": "",
            "SubCategoryId": "",
            "RequestOf": "",
            "CouncilPeriod": str(session),
            "Introducer": "",
            "CoSponsor": "",
            "CommitteeReferral": "",
            "CommitteeReferralComments": "",
            "StartDate": "",
            "EndDate": "",
            "QueryLimit": 100,
            "FilterType": "",
            "Phases": "",
            "LegislationStatus": "0",
            "IncludeDocumentSearch": "false",
        },
    }

    def fetch_page(start):
        # Return the search rows beginning at record offset `start`.
        params["request"]["iDisplayStart"] = start
        response = self.post(url, headers=headers, data=json.dumps(params))
        # the response is a terrible string-of-nested-json-strings. Yuck.
        return decode_json(response.json()["d"])["aaData"]

    def clean_name(person):
        # they messed up Phil Mendelson's name
        name = person["Name"]
        return "Phil Mendelson" if name == "Phil Pmendelson" else name

    data = fetch_page(start_record)

    while data:
        for summary in data:
            # sometimes they're in there more than once, so we'll keep track
            bill_versions = []

            bill_id = summary["Title"]
            if bill_id.startswith("AG"):  # actually an agenda, skip
                continue

            bill_params = {"legislationId": bill_id}
            bill_info = self.post(bill_url, headers=headers,
                                  data=json.dumps(bill_params))
            bill_info = decode_json(bill_info.json()["d"])["data"]
            bill_source_url = "http://lims.dccouncil.us/Legislation/" + bill_id

            legislation_info = bill_info["Legislation"][0]
            title = legislation_info["ShortTitle"]

            if bill_id.startswith(("R", "CER")):
                bill_type = "resolution"
            else:
                bill_type = "bill"

            # dc has no chambers. calling it all upper
            bill = Bill(bill_id, legislative_session=session, title=title,
                        classification=bill_type)

            # sponsors and cosponsors
            if "Introducer" in legislation_info:
                introducers = legislation_info["Introducer"]
                intro_date = self.date_format(
                    legislation_info["IntroductionDate"])
                bill.add_action("Introduced", intro_date, chamber="upper",
                                classification="introduction")
            else:
                # sometimes there are introducers, sometimes not.
                # Keep going with an empty list, but log the bill. The
                # local session/id are logged because the Bill object is
                # built without a chamber and is not indexed by key here.
                self.logger.warning("No Introducer: %s: %s", session, bill_id)
                introducers = []

            # sometimes there are cosponsors, sometimes not.
            cosponsors = legislation_info.get("CoSponsor", [])

            for introducer in introducers:
                # The classification is a label, not the co-sponsor list.
                bill.add_sponsorship(clean_name(introducer),
                                     classification="primary",
                                     entity_type='person',
                                     primary=True)

            for cosponsor in cosponsors:
                # Same call as for introducers so both sponsor kinds are
                # recorded through one API.
                bill.add_sponsorship(clean_name(cosponsor),
                                     classification="cosponsor",
                                     entity_type='person',
                                     primary=False)

            # if it's become law, add the law number as an alternate title
            law_num = legislation_info.get("LawNumber")
            if law_num:
                bill.add_title(law_num)

            # also sometimes it's got an act number
            act_num = legislation_info.get("ActNumber")
            if act_num:
                bill.add_title(act_num)

            # sometimes AdditionalInformation has a previous bill name
            if "AdditionalInformation" in legislation_info:
                add_info = legislation_info["AdditionalInformation"]
                if "previously" in add_info.lower():
                    prev_title = add_info.lower().replace(
                        "previously", "").strip().replace(" ", "")
                    bill.add_title(prev_title.upper())
                elif add_info:
                    # NOTE(review): assumes Bill supports item assignment
                    # -- confirm against the Bill class in use.
                    bill["additional_information"] = add_info

            # NOTE(review): the `committees=` / `legislators=` keywords
            # passed to add_action below and in the committee section are
            # kept as found -- confirm that Bill.add_action accepts them.
            if "WithDrawnDate" in legislation_info:
                withdrawn_date = self.date_format(
                    legislation_info["WithDrawnDate"])
                withdrawn_by = legislation_info["WithdrawnBy"][0]["Name"].strip()
                if withdrawn_by == "the Mayor":
                    bill.add_action("withdrawn", withdrawn_date,
                                    chamber="executive",
                                    classification="withdrawal")
                elif "committee" in withdrawn_by.lower():
                    bill.add_action("withdrawn", withdrawn_date,
                                    chamber="upper",
                                    classification="withdrawal",
                                    committees=withdrawn_by)
                else:
                    bill.add_action("withdrawn", withdrawn_date,
                                    chamber="upper",
                                    classification="withdrawal",
                                    legislators=withdrawn_by)

            # deal with actions involving the mayor
            mayor = bill_info["MayorReview"]
            if mayor:
                mayor = mayor[0]

                # in dc, mayor == governor because openstates schema
                if "TransmittedDate" in mayor:
                    transmitted_date = self.date_format(
                        mayor["TransmittedDate"])
                    bill.add_action("transmitted to mayor", transmitted_date,
                                    chamber="executive",
                                    classification="executive-receipt")

                if 'SignedDate' in mayor:
                    signed_date = self.date_format(mayor["SignedDate"])
                    bill.add_action("signed", signed_date,
                                    chamber="executive",
                                    classification="executive-signature")
                elif 'ReturnedDate' in mayor:
                    # if returned but not signed, it was vetoed
                    veto_date = self.date_format(mayor["ReturnedDate"])
                    bill.add_action("vetoed", veto_date,
                                    chamber="executive",
                                    classification="executive-veto")

                    if 'EnactedDate' in mayor:
                        # if it was returned and enacted but not signed,
                        # there was a veto override
                        override_date = self.date_format(mayor["EnactedDate"])
                        bill.add_action("veto override", override_date,
                                        chamber="upper",
                                        classification="veto-override-passage")

                if 'AttachmentPath' in mayor:
                    # documents relating to the mayor's review
                    self.add_documents(mayor["AttachmentPath"], bill)

            congress = bill_info["CongressReview"]
            if congress:
                congress = congress[0]
                if "TransmittedDate" in congress:
                    transmitted_date = self.date_format(
                        congress["TransmittedDate"])
                    bill.add_action("Transmitted to Congress for review",
                                    transmitted_date, chamber="other")

            # deal with committee actions
            if "DateRead" in legislation_info:
                date = legislation_info["DateRead"]
            elif "IntroductionDate" in legislation_info:
                date = legislation_info["IntroductionDate"]
            else:
                self.logger.warning(
                    "Crap, we can't find anything that looks like an action date. Skipping")
                continue
            date = self.date_format(date)

            if "CommitteeReferral" in legislation_info:
                committees = [committee["Name"] for committee
                              in legislation_info["CommitteeReferral"]]
                # A bill retained by the council was not referred anywhere.
                if any(name.lower() == "retained by the council"
                       for name in committees):
                    committees = []
                if committees:
                    bill.add_action("referred to committee", date,
                                    chamber="upper",
                                    committees=committees,
                                    classification="referral-committee")

            if "CommitteeReferralComments" in legislation_info:
                committees = [committee["Name"] for committee
                              in legislation_info["CommitteeReferralComments"]]
                bill.add_action("comments from committee", date,
                                chamber="upper",
                                committees=committees,
                                classification="other")

            # deal with random docs floating around
            for other_doc in bill_info["OtherDocuments"]:
                if "AttachmentPath" in other_doc:
                    self.add_documents(other_doc["AttachmentPath"], bill)
                else:
                    self.logger.warning(
                        "Document path missing from 'Other Documents'")

            if "MemoLink" in legislation_info:
                self.add_documents(legislation_info["MemoLink"], bill)

            if "AttachmentPath" in legislation_info:
                self.add_documents(legislation_info["AttachmentPath"], bill)

            # full council votes
            for vote in bill_info["VotingSummary"]:
                self.process_vote(vote, bill, member_ids)

            # deal with committee votes
            for committee_action in bill_info.get("CommitteeMarkup", []):
                self.process_committee_vote(committee_action, bill)
                # Look for the attachment on the markup entry itself
                # (presumably where it lives -- confirm against the API).
                # Testing the containing list can never match, and the
                # full-council `vote` variable does not belong here.
                if "AttachmentPath" in committee_action:
                    self.add_documents(committee_action["AttachmentPath"], bill)

            bill.add_source(bill_source_url)
            self.save_bill(bill)

        # get next page
        start_record += per_page
        data = fetch_page(start_record)