def scrape_bill(self, chamber, session, url):
    html = self.get(url).content
    page = lxml.html.fromstring(html)
    page.make_links_absolute(self.BASE_URL)

    if page.xpath('//h2[@style="font-size:1.3rem;"]/a[1]/text()'):
        bill_id = page.xpath(
            '//h2[@style="font-size:1.3rem;"]/a[1]/text()')[0].strip()
    elif page.xpath('//h2[@style="font-size:1.3rem;"]/text()'):
        bill_id = page.xpath(
            '//h2[@style="font-size:1.3rem;"]/text()')[0].strip()
    else:
        self.warning("No bill id for {}".format(url))
        return

    title = page.xpath(
        '//dt[contains(text(), "Title")]/following-sibling::dd[1]/text()'
    )[0].strip()

    if "B" in bill_id:
        _type = ["bill"]
    elif "J" in bill_id:
        _type = ["joint resolution"]
    elif "HS" in bill_id:
        _type = ["resolution"]
    else:
        raise ValueError("unknown bill type " + bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=_type,
    )
    bill.add_source(url)

    self.scrape_bill_subjects(bill, page)
    self.scrape_bill_sponsors(bill, page)
    self.scrape_bill_actions(bill, page)

    # fiscal note
    if page.xpath(
            '//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'):
        fiscal_note = page.xpath(
            '//dt[contains(text(), "Analysis")]/following-sibling::dd[1]/a'
        )[0]
        fiscal_url = fiscal_note.get("href")
        fiscal_title = fiscal_note.text_content()
        bill.add_document_link(
            fiscal_title,
            fiscal_url,
            media_type="application/pdf",
        )

    # yield from self.parse_bill_votes_new(doc, bill)

    yield bill
def scrape_bill(self, chamber, session, bill_id):
    bill_num = bill_id.split()[1]

    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, self.biennium, bill_num))

    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    page = xpath(page, "//wa:Legislation")[0]

    xml_chamber = xpath(page, 'string(wa:OriginalAgency)')
    chamber = self._chamber_map[xml_chamber]

    title = xpath(page, "string(wa:LongDescription)")

    bill_type = xpath(
        page,
        "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()

    if bill_type == 'gubernatorial appointment':
        return

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=[bill_type])

    fake_source = ("http://apps.leg.wa.gov/billinfo/"
                   "summary.aspx?bill=%s&year=%s" % (bill_num, session[0:4]))
    bill.add_source(fake_source)

    try:
        for version in self.versions[bill_id]:
            bill.add_version_link(note=version['note'],
                                  url=version['url'],
                                  media_type=version['media_type'])
    except KeyError:
        self.warning("No versions were found for {}".format(bill_id))

    try:
        for document in self.documents[bill_num]:
            bill.add_document_link(note=document['note'],
                                   url=document['url'],
                                   media_type=document['media_type'])
    except KeyError:
        pass

    self.scrape_sponsors(bill)
    self.scrape_actions(bill, bill_num)
    self.scrape_hearings(bill, bill_num)
    yield from self.scrape_votes(bill)
    bill.subject = list(set(self._subjects[bill_id]))
    yield bill
def scrape_bill(self, chamber, session, bill_id):
    bill_num = bill_id.split()[1]

    url = ("%s/GetLegislation?biennium=%s&billNumber"
           "=%s" % (self._base_url, self.biennium, bill_num))

    page = self.get(url)
    page = lxml.etree.fromstring(page.content)
    page = xpath(page, "//wa:Legislation")[0]

    title = xpath(page, "string(wa:LongDescription)")

    bill_type = xpath(
        page,
        "string(wa:ShortLegislationType/wa:LongLegislationType)")
    bill_type = bill_type.lower()

    if bill_type == 'gubernatorial appointment':
        return

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=[bill_type])

    fake_source = ("http://apps.leg.wa.gov/billinfo/"
                   "summary.aspx?bill=%s&year=%s" % (bill_num, session[0:4]))
    bill.add_source(fake_source)

    try:
        for version in self.versions[bill_id]:
            bill.add_version_link(note=version['note'],
                                  url=version['url'],
                                  media_type=version['media_type'])
    except KeyError:
        self.warning("No versions were found for {}".format(bill_id))

    try:
        for document in self.documents[bill_num]:
            bill.add_document_link(note=document['note'],
                                   url=document['url'],
                                   media_type=document['media_type'])
    except KeyError:
        pass

    self.scrape_sponsors(bill)
    self.scrape_actions(bill, bill_num)
    self.scrape_hearings(bill, bill_num)
    yield from self.scrape_votes(bill)
    bill.subject = list(set(self._subjects[bill_id]))
    yield bill
def scrape_chamber(self, chamber, session):
    chamber_abbrev = {'upper': 'SF', 'lower': 'HB'}[chamber]

    url = ("http://legisweb.state.wy.us/%s/billreference/"
           "BillReference.aspx?type=%s" % (session, chamber_abbrev))
    page = self.lxmlize(url)

    for tr in page.xpath(
            "//table[contains(@id,'cphContent_gvBills')]//tr")[1:]:
        bill_id = tr.xpath("string(td[1])").strip()
        title = tr.xpath("string(td[2])").strip()

        if bill_id[0:2] in ['SJ', 'HJ']:
            bill_type = 'joint resolution'
        else:
            bill_type = 'bill'

        bill = Bill(bill_id, legislative_session=session, title=title,
                    chamber=chamber, classification=bill_type)

        yield from self.scrape_digest(bill, chamber)

        # versions
        for a in (tr.xpath('td[8]//a') + tr.xpath('td[11]//a') +
                  tr.xpath('td[12]//a')):
            # skip references to other bills
            if a.text.startswith('See'):
                continue
            bill.add_version_link(a.text, a.get('href'),
                                  media_type='application/pdf')

        # documents
        fnote = tr.xpath('td[9]//a')
        if fnote:
            bill.add_document_link('Fiscal Note', fnote[0].get('href'))
        summary = tr.xpath('td[14]//a')
        if summary:
            bill.add_document_link('Summary', summary[0].get('href'))

        bill.add_source(url)
        yield bill
def scrape(self, window=28):
    n_days_ago = datetime.datetime.utcnow() - datetime.timedelta(float(window))
    for matter in self.matters(n_days_ago):
        matter_id = matter['MatterId']

        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})

        legistar_web = matter['legistar_url']
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        for identifier in alternate_identifiers:
            bill.add_identifier(identifier)

        for action, vote in self.actions(matter_id):
            act = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                act.add_related_entity(
                    body_name,
                    'organization',
                    entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote
            if result:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for vote in votes:
                    raw_option = vote['VoteValueName'].lower()
                    clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                    vote_event.vote(clean_option, vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        for relation in self.relations(matter_id):
            try:
                # Get data (i.e., json) for the related bill, so we can find
                # its 'MatterFile' (identifier) and 'MatterIntroDate' (used to
                # determine its legislative session). Sometimes the related
                # bill does not yet exist; in that case the API raises an
                # error and we skip the relation.
                related_bill = self.endpoint('/matters/{0}',
                                             relation['MatterRelationMatterId'])
            except scrapelib.HTTPError:
                continue
            else:
                date = related_bill['MatterIntroDate']
                related_bill_session = self.session(self.toTime(date))
                identifier = related_bill['MatterFile']
                bill.add_related_bill(identifier=identifier,
                                      legislative_session=related_bill_session,
                                      relation_type='companion')
                # Currently, the relation type for bills can be one of a few
                # possibilities:
                # https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104
                # Metro simply understands these as related files, suggesting
                # that they receive a relation of 'companion'.

        bill.add_version_link(
            'Board Report',
            'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id),
            media_type="application/pdf")

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_document_link(
                    attachment['MatterAttachmentName'],
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']

            if text['MatterTextRtf']:
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '')

        yield bill
def scrape_bills(self, session, year_abr):
    # Main Bill information
    main_bill_csv = self.access_to_csv('MainBill')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["Synopsis"]
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title... just skip them
        if not title:
            continue

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            legislative_session=session,
            classification=self._bill_types[bill_type[1:]],
        )
        if rec['IdenticalBillNumber'].strip():
            bill.add_related_bill(
                rec['IdenticalBillNumber'].split()[0],
                legislative_session=session,
                relation_type='companion',
            )

        # TODO: last session info is in there too
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_csv = self.access_to_csv('BillSpon')
    for rec in bill_sponsors_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in sponsor database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        name = rec["Sponsor"]
        sponsor_type = rec["Type"]
        if sponsor_type == 'P':
            sponsor_type = "primary"
        else:
            sponsor_type = "cosponsor"
        bill.add_sponsorship(name, classification=sponsor_type,
                             entity_type='person',
                             primary=sponsor_type == 'primary')

    # Documents
    bill_document_csv = self.access_to_csv('BillWP')
    for rec in bill_document_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in document database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["Document"]
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]

        # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/{}/Bills/{}'.format(
            year_abr,
            document.replace('.DOC', '.HTM'),
        )

        # name document based on _doctypes
        try:
            doc_name = self._doctypes[rec['DocType']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' %
                            (rec['DocType'], bill_id))
        if rec['Comment']:
            doc_name += ' ' + rec['Comment']

        # Clean HTMX links.
        if htm_url.endswith('HTMX'):
            htm_url = re.sub('X$', '', htm_url)

        if rec['DocType'] in self._version_types:
            if htm_url.endswith('HTM'):
                mimetype = 'text/html'
            elif htm_url.endswith('wpd'):
                mimetype = 'application/vnd.wordperfect'
            try:
                bill.add_version_link(doc_name, htm_url, media_type=mimetype)
            except ValueError:
                self.warning("Couldn't find a document for bill {}".format(bill_id))
        else:
            bill.add_document_link(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    vote_info_list = [
        'A%s' % year_abr,
        'A%s' % next_year,
        'S%s' % year_abr,
        'S%s' % next_year,
        'CA%s-%s' % (year_abr, next_year),
        'CS%s-%s' % (year_abr, next_year),
    ]

    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.FTPError:
            self.warning('could not find %s' % s_vote_url)
            continue
        zippedfile = zipfile.ZipFile(s_vote_zip)
        for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
            try:
                # zipfile no longer accepts the deprecated 'rU' mode;
                # plain 'r' is equivalent here
                vote_file = io.TextIOWrapper(zippedfile.open(vfile, 'r'))
            except KeyError:
                #
                # Right, so, 2011 we have an "End" file with more
                # vote data than was in the original dump.
                #
                # self.warning("No such file: %s" % (vfile))
                continue
            vdict_file = csv.DictReader(vote_file)

            votes = {}
            if filename.startswith('A') or filename.startswith('CA'):
                chamber = "lower"
            else:
                chamber = "upper"

            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            for rec in vdict_file:
                if vote_file_type == 'chamber':
                    bill_id = rec["Bill"].strip()
                    leg = rec["Full_Name"]

                    date = rec["Session_Date"]
                    action = rec["Action"]
                    leg_vote = rec["Legislator_Vote"]
                    vote_parts = (bill_id, chamber, action)
                else:
                    bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                    leg = rec['Name']
                    # drop time portion
                    date = rec['Agenda_Date'].split()[0]
                    # make motion readable
                    action = self._com_vote_motions[rec['BillAction']]
                    # first char (Y/N) use [0:1] to ignore ''
                    leg_vote = rec['LegislatorVote'][0:1]
                    committee = rec['Committee_House']
                    vote_parts = (bill_id, chamber, action, committee)

                date = datetime.strptime(date, "%m/%d/%Y")
                vote_id = '_'.join(vote_parts).replace(' ', '_')

                if bill_id[0] == 'A':
                    b_chamber = "lower"
                else:
                    b_chamber = "upper"

                if vote_id not in votes:
                    votes[vote_id] = VoteEvent(
                        start_date=TIMEZONE.localize(date),
                        chamber=chamber,
                        motion_text=action,
                        classification='passage',
                        result=None,
                        bill=bill_id,
                        bill_chamber=b_chamber,
                        legislative_session=session,
                    )

                if leg_vote == "Y":
                    votes[vote_id].vote('yes', leg)
                elif leg_vote == "N":
                    votes[vote_id].vote('no', leg)
                else:
                    votes[vote_id].vote('other', leg)

        # remove temp file
        os.remove(s_vote_zip)

        # Count yes/no/other votes and save the overall result
        for vote in votes.values():
            counts = collections.defaultdict(int)
            for count in vote.votes:
                counts[count['option']] += 1
            vote.set_count('yes', counts['yes'])
            vote.set_count('no', counts['no'])
            vote.set_count('other', counts['other'])

            # Veto override.
            if vote.motion_text == 'OVERRIDE':
                # Per the NJ leg's glossary, a veto override requires
                # 2/3rds of each chamber: 27 in the senate, 54 in the house.
                # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                if vote.chamber == 'lower':
                    vote.result = 'pass' if counts['yes'] >= 54 else 'fail'
                elif vote.chamber == 'upper':
                    vote.result = 'pass' if counts['yes'] >= 27 else 'fail'
            else:
                # Regular vote.
                vote.result = 'pass' if counts['yes'] > counts['no'] else 'fail'

            vote.add_source('http://www.njleg.state.nj.us/downloads.asp')
            yield vote

    # Actions
    bill_action_csv = self.access_to_csv('BillHist')
    actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

    for rec in bill_action_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in action database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        action = rec["Action"]
        date = rec["DateAction"]
        date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
        actor = actor_map[rec["House"]]
        comment = rec["Comment"]
        action, atype = self.categorize_action(action, bill_id)
        if comment:
            action += (' ' + comment)
        bill.add_action(
            action,
            date=TIMEZONE.localize(date),
            classification=atype,
            chamber=actor,
        )

    # Subjects
    subject_csv = self.access_to_csv('BillSubj')
    for rec in subject_csv:
        bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in subject database' % bill_id)
            continue
        bill = bill_dict.get(bill_id)
        if bill:
            bill.subject.append(rec['SubjectKey'])
        else:
            self.warning('invalid bill id in BillSubj: %s' % bill_id)

    phony_bill_count = 0
    # save all bills at the end
    for bill in bill_dict.values():
        # add sources
        if not bill.actions and not bill.versions:
            self.warning('probable phony bill detected %s', bill.identifier)
            phony_bill_count += 1
        else:
            bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
            yield bill

    if phony_bill_count:
        self.warning('%s total phony bills detected', phony_bill_count)
def scrape_bills(self, session, year_abr):
    # Main Bill information
    main_bill_csv = self.access_to_csv('MainBill')

    # keep a dictionary of bills (mapping bill_id to Bill obj)
    bill_dict = {}

    for rec in main_bill_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        title = rec["Synopsis"]
        if bill_type[0] == 'A':
            chamber = "lower"
        else:
            chamber = "upper"

        # some bills have a blank title... just skip them
        if not title:
            continue

        bill = Bill(
            bill_id,
            title=title,
            chamber=chamber,
            legislative_session=session,
            classification=self._bill_types[bill_type[1:]],
        )
        if rec['IdenticalBillNumber'].strip():
            bill.add_related_bill(
                rec['IdenticalBillNumber'].split()[0],
                legislative_session=session,
                relation_type='companion',
            )

        # TODO: last session info is in there too
        bill_dict[bill_id] = bill

    # Sponsors
    bill_sponsors_csv = self.access_to_csv('BillSpon')
    for rec in bill_sponsors_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in sponsor database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        name = rec["Sponsor"]
        sponsor_type = rec["Type"]
        if sponsor_type == 'P':
            sponsor_type = "primary"
        else:
            sponsor_type = "cosponsor"
        bill.add_sponsorship(name, classification=sponsor_type,
                             entity_type='person',
                             primary=sponsor_type == 'primary')

    # Documents
    bill_document_csv = self.access_to_csv('BillWP')
    for rec in bill_document_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in document database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        document = rec["Document"]
        document = document.split('\\')
        document = document[-2] + "/" + document[-1]

        # doc_url = "ftp://www.njleg.state.nj.us/%s/%s" % (year, document)
        htm_url = 'http://www.njleg.state.nj.us/{}/Bills/{}'.format(
            year_abr,
            document.replace('.DOC', '.HTM'),
        )

        # name document based on _doctypes
        try:
            doc_name = self._doctypes[rec['DocType']]
        except KeyError:
            raise Exception('unknown doctype %s on %s' %
                            (rec['DocType'], bill_id))
        if rec['Comment']:
            doc_name += ' ' + rec['Comment']

        # Clean HTMX links.
        if htm_url.endswith('HTMX'):
            htm_url = re.sub('X$', '', htm_url)

        if rec['DocType'] in self._version_types:
            if htm_url.endswith('HTM'):
                mimetype = 'text/html'
            elif htm_url.endswith('wpd'):
                mimetype = 'application/vnd.wordperfect'
            try:
                bill.add_version_link(doc_name, htm_url, media_type=mimetype)
            except ValueError:
                self.warning(
                    "Couldn't find a document for bill {}".format(bill_id))
        else:
            bill.add_document_link(doc_name, htm_url)

    # Votes
    next_year = int(year_abr) + 1
    vote_info_list = [
        'A%s' % year_abr,
        'A%s' % next_year,
        'S%s' % year_abr,
        'S%s' % next_year,
        'CA%s-%s' % (year_abr, next_year),
        'CS%s-%s' % (year_abr, next_year),
    ]

    for filename in vote_info_list:
        s_vote_url = 'ftp://www.njleg.state.nj.us/votes/%s.zip' % filename
        try:
            s_vote_zip, resp = self.urlretrieve(s_vote_url)
        except scrapelib.FTPError:
            self.warning('could not find %s' % s_vote_url)
            continue
        zippedfile = zipfile.ZipFile(s_vote_zip)
        for vfile in ["%s.txt" % (filename), "%sEnd.txt" % (filename)]:
            try:
                vote_file = io.TextIOWrapper(zippedfile.open(vfile, 'r'))
            except KeyError:
                #
                # Right, so, 2011 we have an "End" file with more
                # vote data than was in the original dump.
                #
                # self.warning("No such file: %s" % (vfile))
                continue
            vdict_file = csv.DictReader(vote_file)

            votes = {}
            if filename.startswith('A') or filename.startswith('CA'):
                chamber = "lower"
            else:
                chamber = "upper"

            if filename.startswith('C'):
                vote_file_type = 'committee'
            else:
                vote_file_type = 'chamber'

            for rec in vdict_file:
                if vote_file_type == 'chamber':
                    bill_id = rec["Bill"].strip()
                    leg = rec["Full_Name"]

                    date = rec["Session_Date"]
                    action = rec["Action"]
                    leg_vote = rec["Legislator_Vote"]
                    vote_parts = (bill_id, chamber, action)
                else:
                    bill_id = '%s%s' % (rec['Bill_Type'], rec['Bill_Number'])
                    leg = rec['Name']
                    # drop time portion
                    date = rec['Agenda_Date'].split()[0]
                    # make motion readable
                    action = self._com_vote_motions[rec['BillAction']]
                    # first char (Y/N) use [0:1] to ignore ''
                    leg_vote = rec['LegislatorVote'][0:1]
                    committee = rec['Committee_House']
                    vote_parts = (bill_id, chamber, action, committee)

                date = datetime.strptime(date, "%m/%d/%Y")
                vote_id = '_'.join(vote_parts).replace(' ', '_')

                if vote_id not in votes:
                    votes[vote_id] = VoteEvent(
                        start_date=TIMEZONE.localize(date),
                        chamber=chamber,
                        motion_text=action,
                        classification='passage',
                        result=None,
                        bill=bill_dict[bill_id])

                if leg_vote == "Y":
                    votes[vote_id].vote('yes', leg)
                elif leg_vote == "N":
                    votes[vote_id].vote('no', leg)
                else:
                    votes[vote_id].vote('other', leg)

        # remove temp file
        os.remove(s_vote_zip)

        # Count yes/no/other votes and save the overall result
        for vote in votes.values():
            counts = collections.defaultdict(int)
            for count in vote.votes:
                counts[count['option']] += 1
            vote.set_count('yes', counts['yes'])
            vote.set_count('no', counts['no'])
            vote.set_count('other', counts['other'])

            # Veto override.
            if vote.motion_text == 'OVERRIDE':
                # Per the NJ leg's glossary, a veto override requires
                # 2/3rds of each chamber: 27 in the senate, 54 in the house.
                # http://www.njleg.state.nj.us/legislativepub/glossary.asp
                if 'lower' in vote.bill:
                    vote.result = 'pass' if counts['yes'] >= 54 else 'fail'
                elif 'upper' in vote.bill:
                    vote.result = 'pass' if counts['yes'] >= 27 else 'fail'
            else:
                # Regular vote.
                vote.result = 'pass' if counts['yes'] > counts['no'] else 'fail'

            vote.add_source('http://www.njleg.state.nj.us/downloads.asp')
            yield vote

    # Actions
    bill_action_csv = self.access_to_csv('BillHist')
    actor_map = {'A': 'lower', 'G': 'executive', 'S': 'upper'}

    for rec in bill_action_csv:
        bill_type = rec["BillType"].strip()
        bill_number = int(rec["BillNumber"])
        bill_id = bill_type + str(bill_number)
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in action database' % bill_id)
            continue
        bill = bill_dict[bill_id]
        action = rec["Action"]
        date = rec["DateAction"]
        date = datetime.strptime(date, "%m/%d/%y %H:%M:%S")
        actor = actor_map[rec["House"]]
        comment = rec["Comment"]
        action, atype = self.categorize_action(action, bill_id)
        if comment:
            action += (' ' + comment)
        bill.add_action(
            action,
            date=TIMEZONE.localize(date),
            classification=atype,
            chamber=actor,
        )

    # Subjects
    subject_csv = self.access_to_csv('BillSubj')
    for rec in subject_csv:
        bill_id = rec['BillType'].strip() + str(int(rec['BillNumber']))
        if bill_id not in bill_dict:
            self.warning('unknown bill %s in subject database' % bill_id)
            continue
        bill = bill_dict.get(bill_id)
        if bill:
            bill.subject.append(rec['SubjectKey'])
        else:
            self.warning('invalid bill id in BillSubj: %s' % bill_id)

    phony_bill_count = 0
    # save all bills at the end
    for bill in bill_dict.values():
        # add sources
        if not bill.actions and not bill.versions:
            self.warning('probable phony bill detected %s', bill.identifier)
            phony_bill_count += 1
        else:
            bill.add_source('http://www.njleg.state.nj.us/downloads.asp')
            yield bill

    if phony_bill_count:
        self.warning('%s total phony bills detected', phony_bill_count)
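# The per-option tally in both NJ variants above is open-coded with a
# collections.defaultdict; collections.Counter expresses the same
# bookkeeping a little more directly. A minimal, self-contained sketch of
# the equivalent counting step -- the sample `recorded` data below is
# hypothetical, standing in for the dicts found in VoteEvent.votes:
import collections

recorded = [
    {'option': 'yes', 'voter_name': 'Smith'},
    {'option': 'yes', 'voter_name': 'Jones'},
    {'option': 'no', 'voter_name': 'Lee'},
    {'option': 'other', 'voter_name': 'Chen'},
]

counts = collections.Counter(v['option'] for v in recorded)
for option in ('yes', 'no', 'other'):
    # mirrors vote.set_count(option, counts[option]) in the scrapers above;
    # Counter returns 0 for missing options, so no default handling is needed
    print(option, counts[option])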
def _parse_house_bill(self, url, session):
    # using the print page makes the page simpler, and also *drastically*
    # smaller (8k rather than 100k)
    url = re.sub("billsummary", "billsummaryprn", url)
    url = '%s/%s' % (self._house_base_url, url)

    # the URL is an iframed version now, so swap in the actual bill page
    url = url.replace('Bill.aspx', 'BillContent.aspx')
    url = url.replace('&code=R', '&code=R&style=new')

    # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
    # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new

    bill_page = self.get(url).text
    bill_page = lxml.html.fromstring(bill_page)
    bill_page.make_links_absolute(url)

    bill_id = bill_page.xpath('//*[@class="entry-title"]/div')
    if len(bill_id) == 0:
        self.info("WARNING: bill summary page is blank! (%s)" % url)
        self._bad_urls.append(url)
        return
    bill_id = bill_id[0].text_content()
    bill_id = clean_text(bill_id)

    bill_desc = bill_page.xpath('//*[@class="BillDescription"]')[0].text_content()
    bill_desc = clean_text(bill_desc)

    table_rows = bill_page.xpath('//table/tr')
    # if there is a cosponsor all the rows are pushed down one for the
    # extra cosponsor row:
    cosponsorOffset = 0
    if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
        cosponsorOffset = 1

    lr_label_tag = table_rows[3 + cosponsorOffset]
    assert lr_label_tag[0].text_content().strip() == 'LR Number:'
    # bill_lr = lr_label_tag[1].text_content()

    lastActionOffset = 0
    if table_rows[4 + cosponsorOffset][0].text_content().strip() == 'Governor Action:':
        lastActionOffset = 1

    official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
    assert official_title_tag[0].text_content().strip() == 'Bill String:'
    official_title = official_title_tag[1].text_content()

    # could substitute the description for the name,
    # but keeping it separate for now.

    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]
        bill_number = int(bill_id[3:].strip())
    else:
        bill_number = int(bill_id[3:])

    subs = []
    bid = bill_id.replace(" ", "")

    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)

    if bill_desc == "":
        if bill_number <= 20:
            # blank bill titles early in session are approp. bills
            bill_desc = 'Appropriations Bill'
        else:
            self.error("Blank title. Skipping. {} / {} / {}".format(
                bill_id, bill_desc, official_title))
            return

    bill = Bill(
        bill_id,
        chamber='lower',
        title=bill_desc,
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_title(official_title, note='official')
    bill.add_source(url)

    bill_sponsor = clean_text(table_rows[0][1].text_content())
    # try:
    #     bill_sponsor_link = table_rows[0][1][0].attrib['href']
    # except IndexError:
    #     return
    bill.add_sponsorship(
        bill_sponsor,
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # check for cosponsors
    sponsors_url, = bill_page.xpath(
        "//a[contains(@href, 'CoSponsors.aspx')]/@href")
    self._parse_cosponsors_from_bill(bill, sponsors_url)

    # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
    # actions_link = '%s/%s' % (self._house_base_url, actions_link_tag.attrib['href'])
    # actions_link = re.sub("content", "print", actions_link)
    actions_link, = bill_page.xpath(
        "//a[contains(@href, 'BillActions.aspx')]/@href")
    yield from self._parse_house_actions(bill, actions_link)

    # get bill documents
    doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
    for doc_tag in reversed(doc_tags):
        doc = clean_text(doc_tag.text_content())
        text_url = '%s%s' % (self._house_base_url, doc_tag[0].attrib['href'])
        bill.add_document_link(doc, text_url, media_type='text/html')

    # get bill versions
    version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
    for version_tag in reversed(version_tags):
        version = clean_text(version_tag.text_content())
        for vurl in version_tag.xpath(".//a"):
            if vurl.text == 'PDF':
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_version_link(version, vurl.attrib['href'],
                                  media_type=mimetype, on_duplicate='ignore')

    # house bill versions
    # everything between the row containing "Bill Text" and the next
    # div.DocHeaderRow
    version_rows = bill_page.xpath(
        '//div[contains(text(),"Bill Text")]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]'
    )
    for row in version_rows:
        # some rows are just broken links, not real versions
        if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
            version = row.xpath(
                './/div[contains(@class,"textType")]/a/text()')[0].strip()
            path = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')[0].strip()
            if '.pdf' in path:
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_version_link(version, path, media_type=mimetype,
                                  on_duplicate='ignore')

    # house bill summaries
    # everything between the row containing "Bill Summary" and the next
    # div.DocHeaderRow
    summary_rows = bill_page.xpath(
        '//div[contains(text(),"Bill Summary")]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]'
    )

    # if there are no amendments, we need a different xpath for summaries
    if not summary_rows:
        summary_rows = bill_page.xpath(
            '//div[contains(text(),"Bill Summary")]/'
            'following-sibling::div[contains(@class,"DocRow")]'
        )

    for row in reversed(summary_rows):
        version = row.xpath(
            './/div[contains(@class,"textType")]/a/text()')[0].strip()
        if version:
            path = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')[0].strip()
            summary_name = 'Bill Summary ({})'.format(version)
            if '.pdf' in path:
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_document_link(summary_name, path, media_type=mimetype,
                                   on_duplicate='ignore')

    # house bill amendments
    amendment_rows = bill_page.xpath(
        '//div[contains(text(),"Amendment")]/'
        'following-sibling::div[contains(@class,"DocRow")]'
    )

    for row in reversed(amendment_rows):
        version = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip()
        path = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip()
        summary_name = 'Amendment {}'.format(version)

        defeated_icon = row.xpath('.//img[contains(@title,"Defeated")]')
        if defeated_icon:
            summary_name = '{} (Defeated)'.format(summary_name)

        adopted_icon = row.xpath('.//img[contains(@title,"Adopted")]')
        if adopted_icon:
            summary_name = '{} (Adopted)'.format(summary_name)

        distributed_icon = row.xpath('.//img[contains(@title,"Distributed")]')
        if distributed_icon:
            summary_name = '{} (Distributed)'.format(summary_name)

        if '.pdf' in path:
            mimetype = 'application/pdf'
        else:
            mimetype = 'text/html'
        bill.add_version_link(summary_name, path, media_type=mimetype,
                              on_duplicate='ignore')

    yield bill
def scrape(self):
    three_days_ago = datetime.datetime.now() - datetime.timedelta(3)
    for matter in self.matters(three_days_ago):
        matter_id = matter['MatterId']

        date = matter['MatterIntroDate']
        title = matter['MatterTitle']
        identifier = matter['MatterFile']

        if not all((date, title, identifier)):
            continue

        bill_session = self.session(self.toTime(date))
        bill_type = BILL_TYPES[matter['MatterTypeName']]

        if identifier.startswith('S'):
            alternate_identifiers = [identifier]
            identifier = identifier[1:]
        else:
            alternate_identifiers = []

        bill = Bill(identifier=identifier,
                    legislative_session=bill_session,
                    title=title,
                    classification=bill_type,
                    from_organization={"name": "Board of Directors"})

        legistar_web = self.legislation_detail_url(matter_id)
        legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

        bill.add_source(legistar_web, note='web')
        bill.add_source(legistar_api, note='api')

        for identifier in alternate_identifiers:
            bill.add_identifier(identifier)

        for action, vote in self.actions(matter_id):
            act = bill.add_action(**action)

            if action['description'] == 'Referred':
                body_name = matter['MatterBodyName']
                act.add_related_entity(
                    body_name,
                    'organization',
                    entity_id=_make_pseudo_id(name=body_name))

            result, votes = vote
            if result:
                vote_event = VoteEvent(
                    legislative_session=bill.legislative_session,
                    motion_text=action['description'],
                    organization=action['organization'],
                    classification=None,
                    start_date=action['date'],
                    result=result,
                    bill=bill)

                vote_event.add_source(legistar_web)
                vote_event.add_source(legistar_api + '/histories')

                for vote in votes:
                    raw_option = vote['VoteValueName'].lower()
                    clean_option = self.VOTE_OPTIONS.get(raw_option, raw_option)
                    vote_event.vote(clean_option, vote['VotePersonName'].strip())

                yield vote_event

        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)

        for topic in self.topics(matter_id):
            bill.add_subject(topic['MatterIndexName'].strip())

        bill.add_version_link(
            'Board Report',
            'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report'.format(matter_id),
            media_type="application/pdf")

        for attachment in self.attachments(matter_id):
            if attachment['MatterAttachmentName']:
                bill.add_document_link(
                    attachment['MatterAttachmentName'],
                    attachment['MatterAttachmentHyperlink'],
                    media_type="application/pdf")

        bill.extras = {'local_classification': matter['MatterTypeName']}

        text = self.text(matter_id)

        if text:
            if text['MatterTextPlain']:
                bill.extras['plain_text'] = text['MatterTextPlain']

            if text['MatterTextRtf']:
                bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                    u'\u0000', '')

        yield bill
def get_bill(self, matter):
    '''Make Bill object from given matter.'''
    matter_id = matter['MatterId']
    if matter_id in DUPLICATED_ACTIONS:
        return None

    date = matter['MatterIntroDate']
    title = matter['MatterName']
    identifier = matter['MatterFile']

    if not all((date, title, identifier)):
        return None

    leg_type = BILL_TYPES[matter['MatterTypeName']]
    bill_session = self.sessions(self.toTime(date))

    bill = Bill(identifier=identifier,
                title=title,
                classification=leg_type,
                legislative_session=bill_session,
                from_organization={"name": "New York City Council"})

    legistar_web = matter['legistar_url']
    legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

    bill.add_source(legistar_web, note='web')
    bill.add_source(legistar_api, note='api')

    if matter['MatterTitle']:
        bill.add_title(matter['MatterTitle'])

    if matter['MatterEXText5']:
        bill.add_abstract(matter['MatterEXText5'], note='')

    try:
        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    for attachment in self.attachments(matter_id):
        if attachment['MatterAttachmentId'] == 103315:  # Duplicate
            return None

        if attachment['MatterAttachmentName']:
            bill.add_document_link(attachment['MatterAttachmentName'],
                                   attachment['MatterAttachmentHyperlink'],
                                   media_type='application/pdf')

    for topic in self.topics(matter_id):
        bill.add_subject(topic['MatterIndexName'].strip())

    for relation in self.relations(matter_id):
        try:
            related_bill = self.endpoint(
                '/matters/{0}', relation['MatterRelationMatterId'])
        except scrapelib.HTTPError:
            return None
        else:
            date = related_bill['MatterIntroDate']
            related_bill_session = self.session(self.toTime(date))
            identifier = related_bill['MatterFile']
            bill.add_related_bill(identifier=identifier,
                                  legislative_session=related_bill_session,
                                  relation_type='companion')

    try:
        text = self.text(matter_id)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    bill.extras['local_classification'] = matter['MatterTypeName']

    if text:
        if text['MatterTextPlain']:
            bill.extras['plain_text'] = text['MatterTextPlain'].replace(
                u'\u0000', '')

        if text['MatterTextRtf']:
            bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                u'\u0000', '')

    return bill
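# get_bill() above returns None for several skip conditions (duplicated
# matters, missing fields, mid-update API errors), so any caller has to
# filter its results. A minimal sketch of the expected driving loop;
# `matters()` is taken from the sibling Legistar scrapers above, while
# `self.start_date` is a hypothetical attribute standing in for whatever
# lookback window the scraper actually uses:
def scrape(self):
    for matter in self.matters(self.start_date):  # hypothetical window attribute
        bill = self.get_bill(matter)
        if bill is not None:
            yield bill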
def scrape_bill(self, session, bill_url):
    page = self.get(bill_url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(bill_url)

    try:
        bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
    except IndexError:
        self.logger.warning("Something is wrong with bill page, skipping.")
        return
    secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

    # checking if there is a matching bill
    if secondary_bill_id:
        secondary_bill_id = secondary_bill_id[0].text
        # swap ids if * is in secondary_bill_id
        if '*' in secondary_bill_id:
            bill_id, secondary_bill_id = secondary_bill_id, bill_id
        secondary_bill_id = secondary_bill_id.strip()
        secondary_bill_id = secondary_bill_id.replace('  ', ' ')

    bill_id = bill_id.replace('*', '').replace('  ', ' ').strip()

    if 'B' in bill_id:
        bill_type = 'bill'
    elif 'JR' in bill_id:
        bill_type = 'joint resolution'
    elif 'R' in bill_id:
        bill_type = 'resolution'

    primary_chamber = 'lower' if 'H' in bill_id else 'upper'
    # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

    title = page.xpath("//span[@id='lblAbstract']")[0].text
    if title is None:
        msg = '%s detail page was missing title info.'
        self.logger.warning(msg % bill_id)
        return

    # bill subject
    subject_pos = title.find('-')
    subjects = [s.strip() for s in title[:subject_pos - 1].split(',')]
    subjects = filter(None, subjects)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=primary_chamber,
        title=title,
        classification=bill_type,
    )
    for subject in subjects:
        bill.add_subject(subject)

    if secondary_bill_id:
        bill.add_identifier(secondary_bill_id)

    bill.add_source(bill_url)

    # Primary Sponsor
    sponsor = page.xpath(
        "//span[@id='lblBillPrimeSponsor']")[0].text_content().split("by")[-1]
    sponsor = sponsor.replace('*', '').strip()
    if sponsor:
        bill.add_sponsorship(
            sponsor,
            classification='primary',
            entity_type='person',
            primary=True,
        )

    # bill text
    btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
    bill.add_version_link('Current Version', btext.get('href'),
                          media_type='application/pdf')

    # documents
    summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
    if summary:
        bill.add_document_link('Summary', summary[0].get('href'))
    fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
    if fiscal:
        bill.add_document_link('Fiscal Note', fiscal[0].get('href'))
    amendments = page.xpath('//a[contains(@href, "/Amend/")]')
    for amendment in amendments:
        bill.add_document_link('Amendment ' + amendment.text,
                               amendment.get('href'))
    # amendment notes in image with alt text describing doc inside <a>
    amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
    for afn in amend_fns:
        bill.add_document_link(
            afn.get('alt'),
            afn.getparent().get('href'),
            on_duplicate='ignore',
        )

    # actions
    atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
    actions_from_table(bill, atable)

    # if there is a matching bill
    if secondary_bill_id:
        # secondary sponsor
        secondary_sponsor = page.xpath(
            "//span[@id='lblCompPrimeSponsor']")[0].text_content().split("by")[-1]
        secondary_sponsor = secondary_sponsor.replace(
            '*', '').replace(')', '').strip()
        # Skip blank-name sponsors.
        if secondary_sponsor:
            bill.add_sponsorship(
                secondary_sponsor,
                classification='primary',
                entity_type='person',
                primary=True,
            )

        # secondary actions
        cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
        actions_from_table(bill, cotable)

    # votes
    yield from self.scrape_vote_events(bill, page, bill_url)

    bill.actions.sort(key=lambda a: a['date'])
    yield bill
def scrape_bill(self, chamber, session, bill_id):
    # try and get bill for the first year of the session biennium
    url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
        session[:4], bill_id.replace(' ', '-'))
    html = self.get(url).text
    # Otherwise, try second year of the session biennium
    if ('Page Not Found' in html or
            'The bill you are looking for is not available yet' in html):
        url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % (
            session[-4:], bill_id.replace(' ', '-'))
        html = self.get(url).text
        if ('Page Not Found' in html or
                'The bill you are looking for is not available yet' in html):
            self.warning("Cannot open bill page for {}; skipping".format(bill_id))
            return

    doc = lxml.html.fromstring(html)
    doc.make_links_absolute('http://legislature.mi.gov')

    title = doc.xpath(
        '//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content()

    # get B/R/JR/CR part and look up bill type
    bill_type = bill_types[bill_id.split(' ')[0][1:]]

    bill = Bill(bill_id, session, title, chamber=chamber,
                classification=bill_type)
    bill.add_source(url)

    # sponsors
    sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a')
    for sponsor in sponsors:
        name = sponsor.text.replace(u'\xa0', ' ')
        # sometimes district gets added as a link
        if name.isnumeric():
            continue

        if len(sponsors) > 1:
            classification = (
                'primary'
                if sponsor.tail and 'primary' in sponsor.tail
                else 'cosponsor'
            )
        else:
            classification = 'primary'
        bill.add_sponsorship(
            name=name,
            chamber=chamber,
            entity_type='person',
            primary=classification == 'primary',
            classification=classification,
        )

    bill.subject = doc.xpath(
        '//span[@id="frg_billstatus_CategoryList"]/a/text()')

    # actions (skip header)
    for row in doc.xpath(
            '//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]:
        tds = row.xpath('td')  # date, journal link, action
        date = tds[0].text_content()
        journal = tds[1].text_content()
        action = tds[2].text_content()
        date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%Y"))
        # instead of trusting upper/lower case, use journal for actor
        actor = 'upper' if 'SJ' in journal else 'lower'
        classification = categorize_action(action)
        bill.add_action(action, date, chamber=actor,
                        classification=classification)

        # check if action mentions a sub
        submatch = re.search(r'WITH SUBSTITUTE\s+([\w\-\d]+)', action,
                             re.IGNORECASE)
        if submatch and tds[2].xpath('a'):
            version_url = tds[2].xpath('a/@href')[0]
            version_name = tds[2].xpath('a/text()')[0].strip()
            version_name = 'Substitute {}'.format(version_name)
            self.info("Found Substitute {}".format(version_url))
            if version_url.lower().endswith('.pdf'):
                mimetype = 'application/pdf'
            elif version_url.lower().endswith('.htm'):
                mimetype = 'text/html'
            bill.add_version_link(version_name, version_url,
                                  media_type=mimetype)

        # check if action mentions a vote
        rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE)
        if rcmatch:
            rc_num = rcmatch.groups()[0]
            # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011
            journal_link = tds[1].xpath('a/@href')
            if journal_link:
                objectname = journal_link[0].rsplit('=', 1)[-1]
                chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor]
                vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % (
                    session, chamber_name, objectname)
                results = self.parse_roll_call(vote_url, rc_num)

                vote = VoteEvent(
                    start_date=date,
                    chamber=actor,
                    bill=bill,
                    motion_text=action,
                    result='pass' if len(results['yes']) > len(results['no'])
                           else 'fail',
                    classification='passage',
                )

                # check the expected counts vs actual
                count = re.search(r'YEAS (\d+)', action, re.IGNORECASE)
                count = int(count.groups()[0]) if count else 0
                if count != len(results['yes']):
                    self.warning('vote count mismatch for %s %s, %d != %d' %
                                 (bill_id, action, count, len(results['yes'])))
                count = re.search(r'NAYS (\d+)', action, re.IGNORECASE)
                count = int(count.groups()[0]) if count else 0
                if count != len(results['no']):
                    self.warning('vote count mismatch for %s %s, %d != %d' %
                                 (bill_id, action, count, len(results['no'])))

                vote.set_count('yes', len(results['yes']))
                vote.set_count('no', len(results['no']))
                vote.set_count('other', len(results['other']))
                for name in results['yes']:
                    vote.yes(name)
                for name in results['no']:
                    vote.no(name)
                for name in results['other']:
                    vote.vote('other', name)

                vote.add_source(vote_url)
                yield vote
            else:
                self.warning("missing journal link for %s %s" %
                             (bill_id, journal))

    # versions
    for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'):
        parsed = self.parse_doc_row(row)
        if parsed:
            name, url = parsed
            if url.endswith('.pdf'):
                mimetype = 'application/pdf'
            elif url.endswith('.htm'):
                mimetype = 'text/html'
            bill.add_version_link(name, url, media_type=mimetype)

    # documents
    for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)
    for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'):
        document = self.parse_doc_row(row)
        if document:
            name, url = document
            bill.add_document_link(name, url)

    yield bill
def parse_bill(self, chamber, session, bill_id, url):
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        self.logger.warning(e)
        return

    last_action = self.parse_bill_field(page, 'Last Action').xpath('text()')[0]
    if 'WITHDRAWN' in last_action.upper():
        self.info("{} Withdrawn, skipping".format(bill_id))
        return

    version = self.parse_bill_field(page, 'Bill Documents')
    if version is None:
        # Bill withdrawn (check this before dereferencing the element)
        self.logger.warning('Bill withdrawn.')
        return

    source_url = version.xpath('a[1]/@href')[0]
    version_title = version.xpath('a[1]/text()')[0].strip()

    if source_url.endswith('.doc'):
        mimetype = 'application/msword'
    elif source_url.endswith('.pdf'):
        mimetype = 'application/pdf'

    title = self.parse_bill_field(page, 'Title').text_content()

    # actions = self.get_nodes(
    #     page,
    #     '//div[@class="StandardText leftDivMargin"]/'
    #     'div[@class="StandardText"][last()]//text()[normalize-space()]')

    if 'CR' in bill_id:
        bill_type = 'concurrent resolution'
    elif 'JR' in bill_id:
        bill_type = 'joint resolution'
    elif 'R' in bill_id:
        bill_type = 'resolution'
    else:
        bill_type = 'bill'

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=bill_type)
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    bill.add_version_link(version_title, source_url, media_type=mimetype)

    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)

    # LM is "Locally Mandated fiscal impact"
    fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
    for fiscal_note in fiscal_notes:
        source_url = fiscal_note.attrib['href']
        if source_url.endswith('.doc'):
            mimetype = 'application/msword'
        elif source_url.endswith('.pdf'):
            mimetype = 'application/pdf'

        bill.add_document_link("Fiscal Note", source_url, media_type=mimetype)

    for link in page.xpath("//td/span/a[contains(@href, 'Legislator-Profile')]"):
        bill.add_sponsorship(link.text.strip(), classification='primary',
                             entity_type='person', primary=True)

    bdr_no = self.parse_bill_field(page, 'Bill Request Number')
    if bdr_no.xpath('text()'):
        bdr = bdr_no.xpath('text()')[0].strip()
        bill.extras["BDR"] = bdr

    yield bill
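# Several of these scrapers repeat the same extension-to-MIME-type chain
# ('.doc', '.pdf', '.htm', '.wpd'), and the chain silently leaves `mimetype`
# unbound when an unexpected extension shows up. The standard library's
# mimetypes module already covers the common cases; a minimal sketch of a
# fallback-safe helper (the helper name and default are assumptions, and the
# WordPerfect type is registered manually since it is not in the default
# table):
import mimetypes

mimetypes.add_type('application/vnd.wordperfect', '.wpd')

def guess_media_type(url, default='application/octet-stream'):
    """Best-effort MIME type from a URL's extension, with a safe default."""
    media_type, _encoding = mimetypes.guess_type(url)
    return media_type or default

# e.g. guess_media_type('x.pdf') -> 'application/pdf'
#      guess_media_type('x.doc') -> 'application/msword'
#      guess_media_type('x.wpd') -> 'application/vnd.wordperfect'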
def test_full_bill():
    create_jurisdiction()
    person = Person.objects.create(id='person-id', name='Adam Smith')
    org = ScrapeOrganization(name='House', classification='lower')
    com = ScrapeOrganization(name='Arbitrary Committee',
                             classification='committee', parent_id=org._id)

    oldbill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                         classification='tax bill', from_organization=org._id)

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', from_organization=org._id)
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    act = bill.add_action('sent to arbitrary committee', '1900-04-04',
                          chamber='lower')
    act.add_related_entity('arbitrary committee', 'organization', com._id)
    bill.add_related_bill("HB 99", legislative_session="1899",
                          relation_type="prior-session")
    bill.add_sponsorship('Adam Smith', classification='extra sponsor',
                         entity_type='person', primary=False,
                         entity_id=person.id)
    bill.add_sponsorship('Jane Smith', classification='lead sponsor',
                         entity_type='person', primary=True)
    bill.add_abstract('This is an act about axes and taxes and tacks.',
                      note="official")
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.pdf',
                           media_type='application/pdf')
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.html',
                           media_type='text/html')
    bill.add_version_link('Fiscal Note', 'http://example.com/v/1',
                          media_type='text/html')
    bill.add_source('http://example.com/source')

    # import bill
    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict(), com.as_dict()])

    pi = PersonImporter('jid')
    pi.json_to_db_id['person-id'] = 'person-id'
    # Since we have to create this person behind the back of the import
    # transaction, we'll fake the json-id to db-id mapping, since they match
    # in this case. This is *really* getting at some implementation detail,
    # but it's the cleanest way to ensure we short-circuit the json id lookup.
    BillImporter('jid', oi, pi).import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier='HB 1')
    assert b.from_organization.classification == 'lower'
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ['taxes', 'axes']
    assert b.abstracts.get().note == 'official'

    # other_title, other_identifier added
    assert b.other_titles.get().title == 'Tack & Axe Tax Act'
    assert b.other_identifiers.get().identifier == 'SB 9'

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(
        classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == 'HB 99'

    # and bill got resolved
    assert rb.related_bill.identifier == 'HB 99'

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    for ss in sponsorships:
        if ss.primary:
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
def scrape_bill(self, chamber, session, bill_id, url):
    page = self.lxmlize(url)

    (header, ) = page.xpath('//h3[@class="heading"]/text()')
    title = header.replace(bill_id, "").strip()

    if '.B. ' in bill_id:
        bill_type = 'bill'
    elif bill_id.startswith('H.R. ') or bill_id.startswith('S.R. '):
        bill_type = 'resolution'
    elif '.C.R. ' in bill_id:
        bill_type = 'concurrent resolution'
    elif '.J.R. ' in bill_id:
        bill_type = 'joint resolution'

    for flag in SUB_BLACKLIST:
        if flag in bill_id:
            bill_id = bill_id.replace(flag, " ")
    bill_id = re.sub(r"\s+", " ", bill_id).strip()

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=bill_type)
    bill.add_source(url)

    primary_info = page.xpath('//div[@id="billsponsordiv"]')
    for info in primary_info:
        (title, name) = [x.strip() for x in info.xpath('.//text()')
                         if x.strip()]
        assert title == "Bill Sponsor:"
        name = name.replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(name, classification='primary',
                             entity_type='person', primary=True)

    floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
    floor_info = [x.strip() for x in floor_info if x.strip()]
    if len(floor_info) in (0, 1):
        # This indicates that no floor sponsor was found
        pass
    elif len(floor_info) == 2:
        assert floor_info[0] == "Floor Sponsor:"
        floor_sponsor = floor_info[1].replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(floor_sponsor, classification='cosponsor',
                             entity_type='person', primary=False)
    else:
        raise AssertionError("Unexpected floor sponsor HTML found")

    versions = page.xpath(
        '//b[text()="Bill Text"]/following-sibling::ul/li/'
        'a[text() and not(text()=" ")]'
    )
    for version in versions:
        # sometimes the href is on the following <a> tag and the tag we
        # have has an onclick
        url = version.get('href')
        if not url:
            url = version.xpath('following-sibling::a[1]/@href')[0]
        bill.add_version_link(
            version.xpath('text()')[0].strip(),
            url,
            media_type='application/pdf'
        )

    for related in page.xpath(
            '//b[text()="Related Documents "]/following-sibling::ul/li/'
            'a[contains(@class,"nlink")]'):
        href = related.xpath('@href')[0]
        if '.fn.pdf' in href:
            bill.add_document_link("Fiscal Note", href,
                                   media_type='application/pdf')
        else:
            text = related.xpath('text()')[0]
            bill.add_document_link(text, href, media_type='application/pdf')

    subjects = []
    for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
        subjects.append(link.text.strip())
    bill.subject = subjects

    status_table = page.xpath('//div[@id="billStatus"]//table')[0]
    yield from self.parse_status(bill, status_table, chamber)

    yield bill
def scrape_bill_page(self, chamber, session, bill_url, bill_abbreviation):
    page = self.lxmlize(bill_url)
    author = self.get_one_xpath(
        page, "//a[@id='ctl00_PageBody_LinkAuthor']/text()")

    def sbp(x):
        return self.scrape_bare_page(
            page.xpath("//a[contains(text(), '%s')]" % (x))[0].attrib['href'])

    authors = [x.text for x in sbp("Authors")]

    try:
        digests = sbp("Digests")
    except IndexError:
        digests = []

    try:
        versions = sbp("Text")
    except IndexError:
        versions = []

    try:
        amendments = sbp("Amendments")
    except IndexError:
        amendments = []

    title = page.xpath(
        "//span[@id='ctl00_PageBody_LabelShortTitle']/text()")[0]
    actions = page.xpath(
        "//div[@id='ctl00_PageBody_PanelBillInfo']/"
        "/table[@style='font-size:small']/tr")

    bill_id = page.xpath(
        "//span[@id='ctl00_PageBody_LabelBillID']/text()")[0]

    bill_type = self._bill_types[bill_abbreviation[1:]]
    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=bill_type)
    bill.add_source(bill_url)

    authors.remove(author)
    bill.add_sponsorship(author, classification='primary',
                         entity_type='person', primary=True)
    for author in authors:
        bill.add_sponsorship(author, classification='cosponsor',
                             entity_type='person', primary=False)

    for digest in digests:
        bill.add_document_link(note=digest.text,
                               url=digest.attrib['href'],
                               media_type="application/pdf")

    for version in versions:
        bill.add_version_link(note=version.text,
                              url=version.attrib['href'],
                              media_type="application/pdf")

    for amendment in amendments:
        bill.add_version_link(note=amendment.text,
                              url=amendment.attrib['href'],
                              media_type="application/pdf")

    flags = {
        "prefiled": ["filing"],
        "referred to the committee": ["referral-committee"],
        "sent to the house": ['passage'],
        "ordered returned to the house": ['passage'],
        "ordered to the senate": ['passage'],
        "signed by the governor": ['executive-signature'],
        "sent to the governor": ['executive-receipt'],
    }

    try:
        votes_link = page.xpath("//a[text() = 'Votes']")[0]
        yield from self.scrape_votes(bill, votes_link.attrib['href'])
    except IndexError:
        # Some bills don't have any votes
        pass

    for action in actions:
        date, chamber, page, text = [x.text for x in action.xpath(".//td")]
        session_year = self.jurisdiction.legislative_sessions[-1]['start_date'][0:4]
        # Session is April -> June. Prefiles look like they're in
        # January at earliest.
        date += '/{}'.format(session_year)
        date = dt.datetime.strptime(date, '%m/%d/%Y')

        chamber = self._chambers[chamber]

        cat = []
        for flag in flags:
            if flag in text.lower():
                cat += flags[flag]

        bill.add_action(description=text,
                        date=date.strftime('%Y-%m-%d'),
                        chamber=chamber,
                        classification=cat)

    yield bill
def scrape_bill(self, session, history_url):
    history_xml = self.get(history_url).text
    root = etree.fromstring(history_xml)

    bill_title = root.findtext("caption")
    if (bill_title is None or "Bill does not exist" in history_xml):
        self.warning("Bill does not appear to exist")
        return
    bill_id = ' '.join(root.attrib['bill'].split(' ')[1:])

    chamber = self.CHAMBERS[bill_id[0]]

    if bill_id[1] == 'B':
        bill_type = ['bill']
    elif bill_id[1] == 'R':
        bill_type = ['resolution']
    elif bill_id[1:3] == 'CR':
        bill_type = ['concurrent resolution']
    elif bill_id[1:3] == 'JR':
        bill_type = ['joint resolution']
    else:
        raise ScrapeError("Invalid bill_id: %s" % bill_id)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=bill_title,
        classification=bill_type
    )

    bill.add_source(history_url)

    for subject in root.iterfind('subjects/subject'):
        bill.add_subject(subject.text.strip())

    versions = [x for x in self.versions if x[0] == bill_id]
    for version in versions:
        bill.add_version_link(
            note=self.NAME_SLUGS[version[1][-5]],
            url=version[1],
            media_type='text/html'
        )

    analyses = [x for x in self.analyses if x[0] == bill_id]
    for analysis in analyses:
        bill.add_document_link(
            note="Analysis ({})".format(self.NAME_SLUGS[analysis[1][-5]]),
            url=analysis[1],
            media_type='text/html'
        )

    fiscal_notes = [x for x in self.fiscal_notes if x[0] == bill_id]
    for fiscal_note in fiscal_notes:
        bill.add_document_link(
            note="Fiscal Note ({})".format(self.NAME_SLUGS[fiscal_note[1][-5]]),
            url=fiscal_note[1],
            media_type='text/html'
        )

    witnesses = [x for x in self.witnesses if x[0] == bill_id]
    for witness in witnesses:
        bill.add_document_link(
            note="Witness List ({})".format(self.NAME_SLUGS[witness[1][-5]]),
            url=witness[1],
            media_type='text/html'
        )

    for action in root.findall('actions/action'):
        act_date = datetime.datetime.strptime(action.findtext('date'),
                                              "%m/%d/%Y").date()

        action_number = action.find('actionNumber').text
        actor = {'H': 'lower',
                 'S': 'upper',
                 'E': 'executive'}[action_number[0]]

        desc = action.findtext('description').strip()

        if desc == 'Scheduled for public hearing on . . .':
            self.warning("Skipping public hearing action with no date")
            continue

        introduced = False

        if desc == 'Amended':
            atype = 'amendment-passage'
        elif desc == 'Amendment(s) offered':
            atype = 'amendment-introduction'
        elif desc == 'Amendment amended':
            atype = 'amendment-amendment'
        elif desc == 'Amendment withdrawn':
            atype = 'amendment-withdrawal'
        elif desc == 'Passed' or desc == 'Adopted':
            atype = 'passage'
        elif re.match(r'^Received (by|from) the', desc):
            if 'Secretary of the Senate' not in desc:
                atype = 'introduction'
            else:
                atype = 'filing'
        elif desc.startswith('Sent to the Governor'):
            # But what if it gets lost in the mail?
            atype = 'executive-receipt'
        elif desc.startswith('Signed by the Governor'):
            atype = 'executive-signature'
        elif desc == 'Vetoed by the Governor':
            atype = 'executive-veto'
        elif desc == 'Read first time':
            atype = ['introduction', 'reading-1']
            introduced = True
        elif desc == 'Read & adopted':
            atype = ['passage']
            if not introduced:
                introduced = True
                atype.append('introduction')
        elif desc == "Passed as amended":
            atype = 'passage'
        elif (desc.startswith('Referred to') or
              desc.startswith("Recommended to be sent to ")):
            atype = 'referral-committee'
        elif desc == "Reported favorably w/o amendment(s)":
            atype = 'committee-passage'
        elif desc == "Filed":
            atype = 'filing'
        elif desc == 'Read 3rd time':
            atype = 'reading-3'
        elif desc == 'Read 2nd time':
            atype = 'reading-2'
        elif desc.startswith('Reported favorably'):
            atype = 'committee-passage-favorable'
        else:
            atype = None

        act = bill.add_action(
            action.findtext('description'),
            act_date,
            chamber=actor,
            classification=atype
        )

        if atype and 'referral-committee' in atype:
            repls = ['Referred to', "Recommended to be sent to "]
            ctty = desc
            for r in repls:
                ctty = ctty.replace(r, "").strip()
            act.add_related_entity(name=ctty, entity_type='organization')

    for author in root.findtext('authors').split(' | '):
        if author != "":
            bill.add_sponsorship(author, classification='primary',
                                 entity_type='person', primary=True)
    for coauthor in root.findtext('coauthors').split(' | '):
        if coauthor != "":
            bill.add_sponsorship(coauthor, classification='cosponsor',
                                 entity_type='person', primary=False)
    for sponsor in root.findtext('sponsors').split(' | '):
        if sponsor != "":
            bill.add_sponsorship(sponsor, classification='primary',
                                 entity_type='person', primary=True)
    for cosponsor in root.findtext('cosponsors').split(' | '):
        if cosponsor != "":
            bill.add_sponsorship(cosponsor, classification='cosponsor',
                                 entity_type='person', primary=False)

    if root.findtext('companions'):
        self._get_companion(bill)

    yield bill
def scrape_bill(self, bill_num, session): chamber_map = {"House": "lower", "Senate": "upper", "LSO": "executive"} # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45 bill_json_url = ("http://wyoleg.gov/LsoService/api/BillInformation/{}/" "{}?calendarDate=".format(session, bill_num)) response = self.get(bill_json_url) bill_json = json.loads(response.content.decode("utf-8")) chamber = "lower" if bill_json["bill"][0] else "upper" bill = Bill( identifier=bill_json["bill"], legislative_session=session, title=bill_json["catchTitle"], chamber=chamber, classification="bill", ) bill.add_title(bill_json["billTitle"]) source_url = "http://lso.wyoleg.gov/Legislation/{}/{}".format( session, bill_json["bill"]) bill.add_source(source_url) for action_json in bill_json["billActions"]: utc_action_date = self.parse_local_date(action_json["statusDate"]) actor = None if action_json["location"] and action_json[ "location"] in chamber_map: actor = chamber_map[action_json["location"]] action = bill.add_action( chamber=actor, description=action_json["statusMessage"], date=utc_action_date, classification=categorize_action(action_json["statusMessage"]), ) action.extras = { "billInformationID": action_json["billInformationID"] } if bill_json["introduced"]: url = "http://wyoleg.gov/{}".format(bill_json["introduced"]) bill.add_version_link( note="Introduced", url=url, media_type="application/pdf", # optional but useful! ) if bill_json["enrolledAct"]: url = "http://wyoleg.gov/{}".format(bill_json["enrolledAct"]) bill.add_version_link( note="Enrolled", url=url, media_type="application/pdf", # optional but useful! ) if bill_json["fiscalNote"]: url = "http://wyoleg.gov/{}".format(bill_json["fiscalNote"]) bill.add_document_link( note="Fiscal Note", url=url, media_type="application/pdf", # optional but useful! ) if bill_json["digest"]: url = "http://wyoleg.gov/{}".format(bill_json["digest"]) bill.add_document_link( note="Bill Digest", url=url, media_type="application/pdf", # optional but useful! ) if bill_json["vetoes"]: for veto in bill_json["vetoes"]: url = "http://wyoleg.gov/{}".format(veto["vetoLinkPath"]) bill.add_version_link( note=veto["vetoLinkText"], url=url, media_type="application/pdf", # optional but useful! 
) for amendment in bill_json["amendments"]: # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf url = "http://wyoleg.gov/{}/Amends/{}.pdf".format( session, amendment["amendmentNumber"]) if amendment["sponsor"] and amendment["status"]: title = "Amendment {} ({}) - {} ({})".format( amendment["amendmentNumber"], amendment["order"], amendment["sponsor"], amendment["status"], ) else: title = "Amendment {} ({})".format( amendment["amendmentNumber"], amendment["order"]) # add versions of the bill text version = bill.add_version_link(note=title, url=url, media_type="application/pdf") version["extras"] = { "amendmentNumber": amendment["amendmentNumber"], "sponsor": amendment["sponsor"], } for sponsor in bill_json["sponsors"]: status = "primary" if sponsor["primarySponsor"] else "cosponsor" sponsor_type = "person" if sponsor[ "sponsorTitle"] else "organization" bill.add_sponsorship( name=sponsor["name"], classification=status, entity_type=sponsor_type, primary=sponsor["primarySponsor"], ) if bill_json["summary"]: bill.add_abstract(note="summary", abstract=bill_json["summary"]) if bill_json["enrolledNumber"]: bill.extras["wy_enrolled_number"] = bill_json["enrolledNumber"] if bill_json["chapter"]: bill.extras["chapter"] = bill_json["chapter"] if bill_json["effectiveDate"]: eff = datetime.datetime.strptime(bill_json["effectiveDate"], "%m/%d/%Y") bill.extras["effective_date"] = eff.strftime("%Y-%m-%d") bill.extras["wy_bill_id"] = bill_json["id"] for vote_json in bill_json["rollCalls"]: yield from self.scrape_vote(bill, vote_json, session) yield bill
def scrape_events_range(self, start_date, end_date):

    def daterange(start_date, end_date):
        number_of_days = int((end_date - start_date).days)
        for n in range(number_of_days):
            yield start_date + dt.timedelta(n)

    for date in daterange(start_date, end_date):
        events = self.extract_events_by_day(date)
        for event in events:
            tz = pytz.timezone("America/Toronto")
            time = dt.datetime.strptime(event['time'], '%I:%M %p')
            start = tz.localize(date.replace(hour=time.hour,
                                             minute=time.minute,
                                             second=0,
                                             microsecond=0))
            source_url = CALENDAR_DAY_TEMPLATE.format(
                start.year, start.month, start.day)
            org_name = event['meeting']
            e = Event(
                name=org_name,
                start_time=start,
                timezone=tz.zone,
                location_name=event['location'],
                status=STATUS_DICT.get(event['meeting_status'])
            )
            e.add_source(source_url)
            e.extras = {
                'meeting_number': event['no'],
                'tmmis_meeting_id': event['meeting_id'],
            }
            e.add_participant(
                name=org_name,
                type='organization',
            )

            def is_agenda_available(event):
                return event['publishing_status'] in ('Agenda Published',
                                                      'Minutes Published')

            def is_council(event):
                return event['meeting'] == self.jurisdiction.name

            if is_agenda_available(event):
                template = (AGENDA_FULL_COUNCIL_TEMPLATE if is_council(event)
                            else AGENDA_FULL_STANDARD_TEMPLATE)
                agenda_url = template.format(event['meeting_id'])
                full_identifiers = list(self.full_identifiers(
                    event['meeting_id'], is_council(event)))

                e.add_source(agenda_url)
                agenda_items = self.agenda_from_url(agenda_url)
                for i, item in enumerate(agenda_items):
                    a = e.add_agenda_item(item['title'])
                    a.add_classification(item['type'].lower())
                    a['order'] = str(i)

                    def normalize_wards(raw):
                        if not raw:
                            raw = 'All'
                        if raw == 'All':
                            return raw.lower()
                        return raw.split(', ')

                    wards = normalize_wards(item['wards'])
                    identifier_regex = re.compile(
                        r'^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$')
                    [full_identifier] = [
                        id for id in full_identifiers
                        if identifier_regex.match(id).group(1) ==
                        item['identifier']]
                    a.add_bill(full_identifier)

                    if full_identifier not in self.seen_agenda_items:
                        b = Bill(
                            # TODO: Fix this hardcode
                            legislative_session='2014-2018',
                            identifier=full_identifier,
                            title=item['title'],
                            from_organization={
                                'name': self.jurisdiction.name},
                        )
                        b.add_source(agenda_url)
                        b.add_document_link(
                            note='canonical',
                            media_type='text/html',
                            url=AGENDA_ITEM_TEMPLATE.format(full_identifier))
                        b.extras = {
                            'wards': wards,
                        }
                        self.seen_agenda_items.append(full_identifier)
                        yield b

            yield e
def scrape_bill(self, chamber, session, bill_id, bill_type, url):
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    title = doc.xpath('//b[text()="TITLE:"]')
    if title:
        title = title[0].tail.strip().strip('"')
    else:
        self.warning("skipping bill %s, no information" % url)
        return

    bill = Bill(
        bill_id,
        title=title,
        chamber=chamber,
        classification=bill_type,
        legislative_session=session,
    )
    bill.add_source(url)

    # Get sponsors
    spons_str = doc.xpath(
        '//b[contains(text(), "SPONSOR")]')[0].tail.strip()
    sponsors_match = re.match(
        r'(SENATOR|REPRESENTATIVE)\([Ss]\) ([^,]+(,[^,]+){0,})',
        spons_str)
    if sponsors_match:
        sponsors = sponsors_match.group(2).split(',')
        sponsor = sponsors[0].strip()

        if sponsor:
            bill.add_sponsorship(
                sponsors[0],
                entity_type='person',
                classification='primary',
                primary=True,
            )

        for sponsor in sponsors[1:]:
            sponsor = sponsor.strip()
            if sponsor:
                bill.add_sponsorship(
                    sponsor,
                    entity_type='person',
                    classification='cosponsor',
                    primary=False,
                )
    else:
        # Committee sponsorship
        spons_str = spons_str.strip()

        if re.match(r' BY REQUEST OF THE GOVERNOR$', spons_str):
            spons_str = re.sub(r' BY REQUEST OF THE GOVERNOR$',
                               '', spons_str).title()
            spons_str = (spons_str +
                         " Committee (by request of the governor)")

        if spons_str:
            bill.add_sponsorship(
                spons_str,
                entity_type='person',
                classification='primary',
                primary=True,
            )

    # Get actions from second myth table
    self._current_comm = None
    act_rows = doc.xpath('(//table[@class="myth"])[2]//tr')[1:]
    for row in act_rows:
        date, journal, raw_chamber, action = row.xpath('td')

        act_date = datetime.datetime.strptime(date.text_content().strip(),
                                              '%m/%d/%y')
        raw_chamber = raw_chamber.text_content().strip()
        action = action.text_content().strip()

        if raw_chamber == "(H)":
            act_chamber = "lower"
        elif raw_chamber == "(S)":
            act_chamber = "upper"
        else:
            # No chamber marker; attribute the action to the bill's own
            # chamber rather than leaving act_chamber unbound.
            act_chamber = chamber

        if re.match(r"\w+ Y(\d+)", action):
            vote_href = journal.xpath('.//a/@href')
            if vote_href:
                yield from self.parse_vote(bill, action, act_chamber,
                                           act_date, vote_href[0])

        action, atype = self.clean_action(action)

        match = re.match(r'^Prefile released (\d+/\d+/\d+)$', action)
        if match:
            action = 'Prefile released'
            act_date = datetime.datetime.strptime(match.group(1),
                                                  '%m/%d/%y')

        bill.add_action(
            action, chamber=act_chamber,
            date=act_date.strftime('%Y-%m-%d'),
            classification=atype)

    # Get subjects
    for subj in doc.xpath('//a[contains(@href, "subject")]/text()'):
        bill.add_subject(subj.strip())

    # Get versions
    text_list_url = (
        "http://www.legis.state.ak.us/"
        "basis/get_fulltext.asp?session=%s&bill=%s"
    ) % (session, bill_id)
    bill.add_source(text_list_url)

    text_doc = lxml.html.fromstring(self.get(text_list_url).text)
    text_doc.make_links_absolute(text_list_url)
    for link in text_doc.xpath('//a[contains(@href, "get_bill_text")]'):
        name = link.xpath('../preceding-sibling::td/text()')[0].strip()
        text_url = link.get('href')
        bill.add_version_link(name, text_url, media_type="text/html")

    # Get documents
    doc_list_url = (
        "http://www.legis.state.ak.us/"
        "basis/get_documents.asp?session=%s&bill=%s"
    ) % (session, bill_id)
    doc_list = lxml.html.fromstring(self.get(doc_list_url).text)
    doc_list.make_links_absolute(doc_list_url)
    bill.add_source(doc_list_url)
    for href in doc_list.xpath(
            '//a[contains(@href, "get_documents")][@onclick]'):
        h_name = href.text_content()
        h_href = href.attrib['href']
        if h_name.strip():
            bill.add_document_link(h_name, h_href)

    yield bill
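# --- Example (not part of the scraper) -----------------------------------
# self.clean_action() is referenced above but not shown. A hypothetical
# sketch of its contract, assuming it normalizes the raw action string and
# returns the (text, classifications) pair that bill.add_action() expects:

def clean_action(action):
    action = action.strip()
    atype = []
    # assumption: prefix matching against a few known Alaska phrasings
    if action.lower().startswith('prefile released'):
        atype.append('filing')
    if 'transmitted to governor' in action.lower():
        atype.append('executive-receipt')
    return action, atype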
def _parse_house_bill(self, url, session):
    # using the print page makes the page simpler, and also *drastically*
    # smaller (8k rather than 100k)
    url = re.sub("billsummary", "billsummaryprn", url)
    url = '%s/%s' % (self._house_base_url, url)

    # the URL is an iframed version now, so swap in for the actual bill page
    url = url.replace('Bill.aspx', 'BillContent.aspx')
    url = url.replace('&code=R', '&code=R&style=new')

    # http://www.house.mo.gov/Bill.aspx?bill=HB26&year=2017&code=R
    # http://www.house.mo.gov/BillContent.aspx?bill=HB26&year=2017&code=R&style=new

    bill_page = self.get(url).text
    bill_page = lxml.html.fromstring(bill_page)
    bill_page.make_links_absolute(url)

    bill_id = bill_page.xpath('//*[@class="entry-title"]/div')
    if len(bill_id) == 0:
        self.info("WARNING: bill summary page is blank! (%s)" % url)
        self._bad_urls.append(url)
        return
    bill_id = bill_id[0].text_content()
    bill_id = clean_text(bill_id)

    bill_desc = bill_page.xpath(
        '//*[@class="BillDescription"]')[0].text_content()
    bill_desc = clean_text(bill_desc)

    table_rows = bill_page.xpath('//table/tr')
    # if there is a cosponsor all the rows are pushed down one for the
    # extra row for the cosponsor:
    cosponsorOffset = 0
    if table_rows[2][0].text_content().strip() == 'Co-Sponsor:':
        cosponsorOffset = 1

    lr_label_tag = table_rows[3 + cosponsorOffset]
    assert lr_label_tag[0].text_content().strip() == 'LR Number:'
    # bill_lr = lr_label_tag[1].text_content()

    lastActionOffset = 0
    if (table_rows[4 + cosponsorOffset][0].text_content().strip() ==
            'Governor Action:'):
        lastActionOffset = 1

    official_title_tag = table_rows[5 + cosponsorOffset + lastActionOffset]
    assert official_title_tag[0].text_content().strip() == 'Bill String:'
    official_title = official_title_tag[1].text_content()

    # could substitute the description for the name,
    # but keeping it separate for now.

    bill_type = "bill"
    triplet = bill_id[:3]
    if triplet in bill_types:
        bill_type = bill_types[triplet]
        bill_number = int(bill_id[3:].strip())
    else:
        bill_number = int(bill_id[3:])

    subs = []
    bid = bill_id.replace(" ", "")

    if bid in self._subjects:
        subs = self._subjects[bid]
        self.info("With subjects for this bill")
        self.info(bid)

    if bill_desc == "":
        if bill_number <= 20:
            # blank bill titles early in session are approp. bills
            bill_desc = 'Appropriations Bill'
        else:
            self.error("Blank title. Skipping. {} / {} / {}".format(
                bill_id, bill_desc, official_title))
            return

    bill = Bill(
        bill_id,
        chamber='lower',
        title=bill_desc,
        legislative_session=self._session_id,
        classification=bill_type,
    )
    bill.subject = subs
    bill.add_title(official_title, note='official')
    bill.add_source(url)

    bill_sponsor = clean_text(table_rows[0][1].text_content())
    # try:
    #     bill_sponsor_link = table_rows[0][1][0].attrib['href']
    # except IndexError:
    #     return
    bill.add_sponsorship(
        bill_sponsor,
        entity_type='person',
        classification='primary',
        primary=True,
    )

    # check for cosponsors
    sponsors_url, = bill_page.xpath(
        "//a[contains(@href, 'CoSponsors.aspx')]/@href")
    self._parse_cosponsors_from_bill(bill, sponsors_url)

    # actions_link_tag = bill_page.xpath('//div[@class="Sections"]/a')[0]
    # actions_link = '%s/%s' % (self._house_base_url,
    #                           actions_link_tag.attrib['href'])
    # actions_link = re.sub("content", "print", actions_link)
    actions_link, = bill_page.xpath(
        "//a[contains(@href, 'BillActions.aspx')]/@href")
    yield from self._parse_house_actions(bill, actions_link)

    # get bill documents
    doc_tags = bill_page.xpath('//div[@class="BillDocuments"][1]/span')
    for doc_tag in reversed(doc_tags):
        doc = clean_text(doc_tag.text_content())
        text_url = '%s%s' % (self._house_base_url,
                             doc_tag[0].attrib['href'])
        bill.add_document_link(doc, text_url, media_type='text/html')

    # get bill versions
    version_tags = bill_page.xpath('//div[@class="BillDocuments"][2]/span')
    for version_tag in reversed(version_tags):
        version = clean_text(version_tag.text_content())
        for vurl in version_tag.xpath(".//a"):
            if vurl.text == 'PDF':
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_version_link(version, vurl.attrib['href'],
                                  media_type=mimetype,
                                  on_duplicate='ignore')

    # house bill versions
    # everything between the row containing "Bill Text" and the next
    # div.DocHeaderRow
    version_rows = bill_page.xpath(
        '//div[contains(text(),"Bill Text")]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(preceding-sibling::div[contains(@class,"DocHeaderRow")])=1]')
    for row in version_rows:
        # some rows are just broken links, not real versions
        if row.xpath('.//div[contains(@class,"textType")]/a/@href'):
            version = row.xpath(
                './/div[contains(@class,"textType")]/a/text()')[0].strip()
            path = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')[0].strip()
            if '.pdf' in path:
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_version_link(version, path, media_type=mimetype,
                                  on_duplicate='ignore')

    # house bill summaries
    # everything between the row containing "Bill Summary" and the next
    # div.DocHeaderRow
    summary_rows = bill_page.xpath(
        '//div[contains(text(),"Bill Summary")]/'
        'following-sibling::div[contains(@class,"DocRow") '
        'and count(following-sibling::div[contains(@class,"DocHeaderRow")])=1]')

    # if there are no amendments, we need a different xpath for summaries
    if not summary_rows:
        summary_rows = bill_page.xpath(
            '//div[contains(text(),"Bill Summary")]/'
            'following-sibling::div[contains(@class,"DocRow")]')

    for row in reversed(summary_rows):
        version = row.xpath(
            './/div[contains(@class,"textType")]/a/text()')[0].strip()
        if version:
            path = row.xpath(
                './/div[contains(@class,"textType")]/a/@href')[0].strip()
            summary_name = 'Bill Summary ({})'.format(version)
            if '.pdf' in path:
                mimetype = 'application/pdf'
            else:
                mimetype = 'text/html'
            bill.add_document_link(summary_name, path, media_type=mimetype,
                                   on_duplicate='ignore')

    # house bill amendments
    amendment_rows = bill_page.xpath(
        '//div[contains(text(),"Amendment")]/'
        'following-sibling::div[contains(@class,"DocRow")]')

    for row in reversed(amendment_rows):
        version = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/text()')[0].strip()
        path = row.xpath(
            './/div[contains(@class,"DocInfoCell")]/a[1]/@href')[0].strip()
        summary_name = 'Amendment {}'.format(version)

        defeated_icon = row.xpath('.//img[contains(@title,"Defeated")]')
        if defeated_icon:
            summary_name = '{} (Defeated)'.format(summary_name)

        adopted_icon = row.xpath('.//img[contains(@title,"Adopted")]')
        if adopted_icon:
            summary_name = '{} (Adopted)'.format(summary_name)

        distributed_icon = row.xpath('.//img[contains(@title,"Distributed")]')
        if distributed_icon:
            summary_name = '{} (Distributed)'.format(summary_name)

        if '.pdf' in path:
            mimetype = 'application/pdf'
        else:
            mimetype = 'text/html'
        bill.add_version_link(summary_name, path, media_type=mimetype,
                              on_duplicate='ignore')

    yield bill
def bill_info(self, bill_link, session, main_url):
    bill_page = self.lxmlize(bill_link)

    long_title = self.get_node(
        bill_page,
        '//div[@class="main-content"]/div[1]/div/h2').text.split()
    bill_number = long_title[0]
    title = ''
    for x in range(2, len(long_title)):
        title += long_title[x] + ' '
    title = title[0:-1]

    if not title:
        self.error('no title, skipping %s', bill_number)
        return

    bill_type = 'resolution' if 'LR' in bill_number else 'bill'

    bill = Bill(bill_number, session, title, classification=bill_type)

    bill.add_source(main_url)
    bill.add_source(bill_link)

    introduced_by = self.get_node(
        bill_page,
        '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/a[1]/text()')

    if not introduced_by:
        introduced_by = self.get_node(
            bill_page,
            '//div[@class="main-content"]/div[3]/div[1]/ul/li[1]/text()')
        introduced_by = introduced_by.split('Introduced By:')[1].strip()

    bill.add_sponsorship(
        name=introduced_by,
        entity_type='person',
        primary=True,
        classification='primary',
    )

    # Default in case the page lists no actions; `actor` is also passed to
    # scrape_votes below, so it must always be bound.
    actor = 'legislature'
    action_nodes = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[5]//table/tbody/tr')

    for action_node in action_nodes:
        date = self.get_node(action_node, './td[1]').text
        date = datetime.strptime(date, '%b %d, %Y')

        # The action node may have an anchor element within it, so
        # we grab all the text within.
        action = self.get_node(action_node, './td[2]').text_content()

        if 'Governor' in action:
            actor = 'executive'
        elif 'Speaker' in action:
            actor = 'legislature'
        else:
            actor = 'legislature'

        action_type = self.action_types(action)
        bill.add_action(
            action,
            date.strftime('%Y-%m-%d'),
            chamber=actor,
            classification=action_type,
        )

    # They're in reverse chronological order.
    bill.actions.reverse()

    # Grabs bill version documents.
    version_links = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[3]/div[2]/'
        'div[@class="hidden-xs"]/ul[1]/li/a')

    for version_link in version_links:
        version_name = version_link.text
        version_url = version_link.attrib['href']
        # replace Current w/ session number
        version_url = version_url.replace('Current', session)
        bill.add_version_link(version_name, version_url,
                              media_type='application/pdf')

    # Adds any documents related to amendments.
    amendment_links = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[5]/div[2]/table/tr/td[1]/a')

    for amendment_link in amendment_links:
        amendment_name = amendment_link.text
        amendment_url = amendment_link.attrib['href']
        bill.add_document_link(amendment_name, amendment_url)

    # Related transcripts.
    transcript_links = self.get_nodes(
        bill_page,
        '//div[@class="main-content"]/div[5]/div[2]/'
        'div[@class="hidden-xs"]/table/tr/td/a')

    for transcript_link in transcript_links:
        transcript_name = transcript_link.text
        transcript_url = transcript_link.attrib['href']
        bill.add_document_link(transcript_name, transcript_url)

    yield bill

    yield from self.scrape_votes(bill, bill_page, actor)
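# --- Example (not part of the scraper) -----------------------------------
# self.action_types() is assumed to map a Nebraska action string to pupa
# classifications; a minimal hypothetical sketch using prefix matching
# (the prefixes here are illustrative, not the scraper's actual table):

NE_ACTION_PREFIXES = (
    ('Introduced', 'introduction'),
    ('Referred to', 'referral-committee'),
    ('Passed on Final Reading', 'passage'),
    ('Approved by Governor', 'executive-signature'),
)


def action_types(action):
    matched = [cls for prefix, cls in NE_ACTION_PREFIXES
               if action.startswith(prefix)]
    return matched or None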
def scrape_bills(self, chamber_to_scrape, session):
    url = ('http://billstatus.ls.state.ms.us/%s/pdf/all_measures/allmsrs.xml'
           % session)

    bill_dir_page = self.get(url)
    root = lxml.etree.fromstring(bill_dir_page.content)
    for mr in root.xpath('//LASTACTION/MSRGROUP'):
        bill_id = mr.xpath('string(MEASURE)').replace(" ", "")
        if bill_id[0] == "S":
            chamber = "upper"
        else:
            chamber = "lower"

        bill_type = {'B': 'bill',
                     'C': 'concurrent resolution',
                     'R': 'resolution',
                     'N': 'nomination'}[bill_id[1]]

        # just skip past bills that are of the wrong chamber
        if chamber != chamber_to_scrape:
            continue

        link = mr.xpath('string(ACTIONLINK)').replace("..", "")
        main_doc = mr.xpath('string(MEASURELINK)').replace("../../../", "")
        main_doc_url = 'http://billstatus.ls.state.ms.us/%s' % main_doc
        bill_details_url = 'http://billstatus.ls.state.ms.us/%s/pdf%s' % (
            session, link)
        try:
            details_page = self.get(bill_details_url)
        except scrapelib.HTTPError:
            self.warning(
                'Bill page not loading for {}; skipping'.format(bill_id))
            continue

        page = details_page.content
        # Some pages have the (invalid) byte 11 sitting around. Just drop
        # them out. Might as well.

        details_root = lxml.etree.fromstring(page)
        title = details_root.xpath('string(//SHORTTITLE)')
        longtitle = details_root.xpath('string(//LONGTITLE)')

        bill = Bill(bill_id,
                    legislative_session=session,
                    chamber=chamber,
                    title=title,
                    classification=bill_type)
        bill.extras['summary'] = longtitle
        bill.add_source(main_doc_url)

        # sponsors
        main_sponsor = details_root.xpath('string(//P_NAME)').split()
        if main_sponsor:
            main_sponsor = main_sponsor[0]
            main_sponsor_link = details_root.xpath(
                'string(//P_LINK)').replace(" ", "_")
            main_sponsor_url = ('http://billstatus.ls.state.ms.us/%s/'
                                'pdf/%s') % (session,
                                             main_sponsor_link.strip('../'))
            bill.add_source(main_sponsor_url)
            bill.add_sponsorship(main_sponsor,
                                 classification='primary',
                                 entity_type='person',
                                 primary=True)

        for author in details_root.xpath('//AUTHORS/ADDITIONAL'):
            leg = author.xpath('string(CO_NAME)').replace(" ", "_")
            if leg:
                leg_url = ('http://billstatus.ls.state.ms.us/%s/'
                           'pdf/House_authors/%s.xml') % (session, leg)
                bill.add_source(leg_url)
                bill.add_sponsorship(leg,
                                     classification='cosponsor',
                                     entity_type='person',
                                     primary=False)

        # Versions
        curr_version = details_root.xpath(
            'string(//CURRENT_OTHER)').replace("../../../../", "")
        if curr_version != "":
            curr_version_url = ("http://billstatus.ls.state.ms.us/"
                                + curr_version)
            bill.add_version_link("Current version", curr_version_url,
                                  on_duplicate="ignore",
                                  media_type="text/html")

        intro_version = details_root.xpath(
            'string(//INTRO_OTHER)').replace("../../../../", "")
        if intro_version != "":
            intro_version_url = ("http://billstatus.ls.state.ms.us/"
                                 + intro_version)
            bill.add_version_link("As Introduced", intro_version_url,
                                  on_duplicate='ignore',
                                  media_type='text/html')

        comm_version = details_root.xpath(
            'string(//CMTESUB_OTHER)').replace("../../../../", "")
        if comm_version.find("documents") != -1:
            comm_version_url = ("http://billstatus.ls.state.ms.us/"
                                + comm_version)
            bill.add_version_link("Committee Substitute", comm_version_url,
                                  on_duplicate='ignore',
                                  media_type='text/html')

        passed_version = details_root.xpath(
            'string(//PASSED_OTHER)').replace("../../../../", "")
        if passed_version.find("documents") != -1:
            passed_version_url = ("http://billstatus.ls.state.ms.us/"
                                  + passed_version)
            title = "As Passed the " + chamber
            bill.add_version_link(title, passed_version_url,
                                  on_duplicate='ignore',
                                  media_type='text/html')

        asg_version = details_root.xpath(
            'string(//ASG_OTHER)').replace("../../../../", "")
        if asg_version.find("documents") != -1:
            asg_version_url = ("http://billstatus.ls.state.ms.us/"
                               + asg_version)
            bill.add_version_link("Approved by the Governor",
                                  asg_version_url,
                                  on_duplicate='ignore',
                                  media_type='text/html')

        # amendments
        # ex: http://billstatus.ls.state.ms.us/2018/pdf/history/HB/HB1040.xml
        for amd in details_root.xpath('//AMENDMENTS/*'):
            if amd.tag == 'HAM':
                name = amd.xpath('HAM_DESC[1]/text()')[0]
                name = append_parens(amd, 'HAM_DISP', name)
                name = append_parens(amd, 'HAM_VDESC', name)

                pdf_url = amd.xpath('string(HAM_PDF)').replace("../", "")
                html_url = amd.xpath('string(HAM_OTHER)').replace("../", "")
            elif amd.tag == 'SAM':
                name = amd.xpath('SAM_DESC[1]/text()')[0]
                name = append_parens(amd, 'SAM_DISP', name)
                name = append_parens(amd, 'SAM_VDESC', name)

                pdf_url = amd.xpath('string(SAM_PDF)').replace("../", "")
                html_url = amd.xpath('string(SAM_OTHER)').replace("../", "")
            elif amd.tag == 'AMRPT':
                name = amd.xpath('AMRPT_DESC[1]/text()')[0]
                pdf_url = amd.xpath('string(AMRPT_PDF)').replace("../", "")
                html_url = amd.xpath(
                    'string(AMRPT_OTHER)').replace("../", "")
            else:
                # unknown amendment element; skip rather than reuse stale
                # name/url values from a previous iteration
                continue

            pdf_url = 'http://billstatus.ls.state.ms.us/' + pdf_url
            html_url = 'http://billstatus.ls.state.ms.us/' + html_url

            if ('adopted' in name.lower() or
                    'amendment report' in name.lower()):
                bill.add_version_link(name, pdf_url,
                                      on_duplicate='ignore',
                                      media_type='application/pdf')
                bill.add_version_link(name, html_url,
                                      on_duplicate='ignore',
                                      media_type='text/html')

        # avoid duplicate votes
        seen_votes = set()

        # Actions
        for action in details_root.xpath('//HISTORY/ACTION'):
            # action_num = action.xpath('string(ACT_NUMBER)').strip()
            # action_num = int(action_num)
            act_vote = action.xpath(
                'string(ACT_VOTE)').replace("../../../..", "")
            action_desc = action.xpath('string(ACT_DESC)')
            date, action_desc = action_desc.split(" ", 1)
            date = date + "/" + session[0:4]
            date = datetime.strptime(date, "%m/%d/%Y")

            if action_desc.startswith("(H)"):
                actor = "lower"
                action = action_desc[4:]
            elif action_desc.startswith("(S)"):
                actor = "upper"
                action = action_desc[4:]
            else:
                actor = "executive"
                action = action_desc

            if "Veto" in action and actor == 'executive':
                version_path = details_root.xpath("string(//VETO_OTHER)")
                version_path = version_path.replace("../../../../", "")
                version_url = ("http://billstatus.ls.state.ms.us/"
                               + version_path)
                bill.add_document_link("Veto", version_url)

            atype = 'other'
            for prefix, prefix_type in self._action_types:
                if action.startswith(prefix):
                    atype = prefix_type
                    break

            bill.add_action(
                action, self._tz.localize(date),
                chamber=actor,
                classification=atype if atype != 'other' else None)

            # use committee names as scraped subjects
            subjects = details_root.xpath('//H_NAME/text()')
            subjects += details_root.xpath('//S_NAME/text()')

            for subject in subjects:
                if subject not in bill.subject:
                    bill.add_subject(subject)

            if act_vote:
                vote_url = 'http://billstatus.ls.state.ms.us%s' % act_vote
                if vote_url not in seen_votes:
                    seen_votes.add(vote_url)
                    yield from self.scrape_votes(vote_url, action,
                                                 date, actor, bill)

        bill.add_source(bill_details_url)
        yield bill
def scrape_bill(self, chamber, session, bill_id, url):
    page = self.lxmlize(url)

    (header, ) = page.xpath('//h3[@class="heading"]/text()')
    title = header.replace(bill_id, "").strip()

    if '.B. ' in bill_id:
        bill_type = 'bill'
    elif bill_id.startswith('H.R. ') or bill_id.startswith('S.R. '):
        bill_type = 'resolution'
    elif '.C.R. ' in bill_id:
        bill_type = 'concurrent resolution'
    elif '.J.R. ' in bill_id:
        bill_type = 'joint resolution'
    else:
        # Without this fallback an unrecognized identifier would leave
        # bill_type unbound below.
        raise ValueError("unrecognized bill id: {}".format(bill_id))

    for flag in SUB_BLACKLIST:
        if flag in bill_id:
            bill_id = bill_id.replace(flag, " ")
    bill_id = re.sub(r"\s+", " ", bill_id).strip()

    bill = Bill(bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=bill_type)
    bill.add_source(url)

    primary_info = page.xpath('//div[@id="billsponsordiv"]')
    for info in primary_info:
        try:
            (title, name) = [
                x.strip() for x in info.xpath('.//text()') if x.strip()
            ]
        except ValueError:
            self.warning(
                "Could not find sponsor's name for {}".format(bill_id))
            continue
        assert title == "Bill Sponsor:"
        name = name.replace("Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(name, classification='primary',
                             entity_type='person', primary=True)

    floor_info = page.xpath('//div[@id="floorsponsordiv"]//text()')
    floor_info = [x.strip() for x in floor_info if x.strip()]
    if len(floor_info) in (0, 1):
        # This indicates that no floor sponsor was found
        pass
    elif len(floor_info) == 2:
        assert floor_info[0] == "Floor Sponsor:"
        floor_sponsor = floor_info[1].replace(
            "Sen. ", "").replace("Rep. ", "")
        bill.add_sponsorship(floor_sponsor, classification='cosponsor',
                             entity_type='person', primary=False)
    else:
        raise AssertionError("Unexpected floor sponsor HTML found")

    versions = page.xpath(
        '//b[text()="Bill Text"]/following-sibling::ul/li/'
        'a[text() and not(text()=" ")]')

    for version in versions:
        # sometimes the href is on the following <a> tag and the tag we
        # have has an onclick
        url = version.get('href')
        if not url:
            url = version.xpath('following-sibling::a[1]/@href')[0]
        bill.add_version_link(
            version.xpath('text()')[0].strip(),
            url,
            media_type='application/pdf'
        )

    for related in page.xpath(
            '//b[text()="Related Documents "]/following-sibling::ul/li/'
            'a[contains(@class,"nlink")]'):
        href = related.xpath('@href')[0]
        if '.fn.pdf' in href:
            bill.add_document_link("Fiscal Note", href,
                                   media_type='application/pdf')
        else:
            text = related.xpath('text()')[0]
            bill.add_document_link(text, href,
                                   media_type='application/pdf')

    subjects = []
    for link in page.xpath("//a[contains(@href, 'RelatedBill')]"):
        subjects.append(link.text.strip())
    bill.subject = subjects

    if page.xpath('//div[@id="billStatus"]//table'):
        status_table = page.xpath('//div[@id="billStatus"]//table')[0]
        yield from self.parse_status(bill, status_table, chamber)

    yield bill
def scrape_bill(self, chamber, session, bill_id, url):
    try:
        page = lxml.html.fromstring(self.get(url).text)
    except scrapelib.HTTPError as e:
        self.warning('error (%s) fetching %s, skipping' % (e, url))
        return

    title = page.xpath(
        "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip()

    if 'JR' in bill_id:
        bill_type = ['joint resolution']
    elif 'CR' in bill_id:
        bill_type = ['concurrent resolution']
    elif 'R' in bill_id:
        bill_type = ['resolution']
    else:
        bill_type = ['bill']

    bill = Bill(bill_id,
                legislative_session=session,
                chamber=chamber,
                title=title,
                classification=bill_type)
    bill.add_source(url)
    bill.subject = self.subject_map[bill_id]

    for link in page.xpath("//a[contains(@id, 'Auth')]"):
        name = link.xpath("string()").strip()

        if ':' in name:
            raise Exception(name)
        if 'otherAuth' in link.attrib['id']:
            bill.add_sponsorship(name, classification='cosponsor',
                                 entity_type='person', primary=False)
        else:
            bill.add_sponsorship(name, classification='primary',
                                 entity_type='person', primary=True)

    act_table = page.xpath("//table[contains(@id, 'Actions')]")[0]
    for tr in act_table.xpath("tr")[2:]:
        action = tr.xpath("string(td[1])").strip()
        if not action or action == 'None':
            continue

        date = tr.xpath("string(td[3])").strip()
        date = datetime.datetime.strptime(date, "%m/%d/%Y").date()

        actor = tr.xpath("string(td[4])").strip()
        if actor == 'H':
            actor = 'lower'
        elif actor == 'S':
            actor = 'upper'

        attrs = self.categorizer.categorize(action)
        related_entities = []
        for item in attrs['committees']:
            related_entities.append({
                'type': 'committee',
                'name': item
            })
        for item in attrs['legislators']:
            related_entities.append({
                'type': 'legislator',
                'name': item
            })
        bill.add_action(description=action,
                        date=date.strftime('%Y-%m-%d'),
                        chamber=actor,
                        classification=attrs['classification'],
                        related_entities=related_entities)

    version_table = page.xpath("//table[contains(@id, 'Versions')]")[0]
    # Keep track of already seen versions to prevent processing duplicates.
    version_urls = []
    for link in version_table.xpath(".//a[contains(@href, '.PDF')]"):
        version_url = link.attrib['href']
        if version_url in version_urls:
            self.warning('Skipping duplicate version URL.')
            continue
        else:
            version_urls.append(version_url)
        name = link.text.strip()

        if re.search('COMMITTEE REPORTS|SCHEDULED CCR', version_url,
                     re.IGNORECASE):
            bill.add_document_link(note=name, url=version_url,
                                   media_type='application/pdf')
            continue

        bill.add_version_link(note=name, url=version_url,
                              media_type='application/pdf')

    for link in page.xpath(".//a[contains(@href, '_VOTES')]"):
        if 'HT_' not in link.attrib['href']:
            yield from self.scrape_votes(
                bill, self.urlescape(link.attrib['href']))

    # If the bill has no actions and no versions, it's a bogus bill on
    # their website, which appears to happen occasionally. Skip.
    has_no_title = (bill.title == "Short Title Not Found.")
    if has_no_title:
        # If there's no title, this is an empty page. Skip!
        return
    else:
        # Otherwise, save the bills.
        yield bill
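# --- Example (not part of the scraper) -----------------------------------
# The Oklahoma scraper assumes self.categorizer.categorize() returns a dict
# with 'classification', 'committees', and 'legislators' keys. The
# transformation into related_entities, restated standalone with sample
# (made-up) categorizer output:

attrs = {'classification': ['referral-committee'],
         'committees': ['Appropriations'],
         'legislators': []}
related_entities = (
    [{'type': 'committee', 'name': name} for name in attrs['committees']] +
    [{'type': 'legislator', 'name': name} for name in attrs['legislators']])
assert related_entities == [{'type': 'committee', 'name': 'Appropriations'}]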
def get_bill(self, matter):
    '''Make Bill object from given matter.

    Currently, NYC Legistar does not have conventional "Types" for
    three newly added committees:
    https://legistar.council.nyc.gov/Departments.aspx

    We communicated the issue to NYC, and until we learn more, we will
    skip the bills attached to those committees.
    '''
    orgs_without_type = [
        'Charter Revision Commission 2019',
        'New York City Advisory Commission on Property Tax Reform',
        'Democratic Conference of the Council of the City of New York'
    ]
    if matter['MatterBodyName'].strip() in orgs_without_type:
        return None

    matter_id = matter['MatterId']
    if matter_id in DUPLICATED_ACTIONS:
        return None

    date = matter['MatterIntroDate']
    title = matter['MatterName']
    identifier = matter['MatterFile']

    if not all((date, title, identifier)):
        return None

    leg_type = BILL_TYPES[matter['MatterTypeName']]
    bill_session = self.sessions(self.toTime(date))

    bill = Bill(identifier=identifier,
                title=title,
                classification=leg_type,
                legislative_session=bill_session,
                from_organization={"name": "New York City Council"})

    legistar_web = matter['legistar_url']
    legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)

    bill.add_source(legistar_web, note='web')
    bill.add_source(legistar_api, note='api')

    if matter['MatterTitle']:
        bill.add_title(matter['MatterTitle'])

    if matter['MatterEXText5']:
        bill.add_abstract(matter['MatterEXText5'], note='')

    try:
        for sponsorship in self.sponsorships(matter_id):
            bill.add_sponsorship(**sponsorship)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    for attachment in self.attachments(matter_id):
        if attachment['MatterAttachmentId'] == 103315:  # Duplicate
            return None
        if attachment['MatterAttachmentName']:
            bill.add_document_link(attachment['MatterAttachmentName'],
                                   attachment['MatterAttachmentHyperlink'],
                                   media_type='application/pdf')

    for topic in self.topics(matter_id):
        bill.add_subject(topic['MatterIndexName'].strip())

    for relation in self.relations(matter_id):
        try:
            related_bill = self.endpoint(
                '/matters/{0}', relation['MatterRelationMatterId'])
        except scrapelib.HTTPError:
            return None
        else:
            date = related_bill['MatterIntroDate']
            related_bill_session = self.sessions(self.toTime(date))
            identifier = related_bill['MatterFile']
            bill.add_related_bill(identifier=identifier,
                                  legislative_session=related_bill_session,
                                  relation_type='companion')

    try:
        text = self.text(matter_id)
    except KeyError:
        self.version_errors.append(legistar_web)
        return None

    bill.extras['local_classification'] = matter['MatterTypeName']

    if text:
        if text['MatterTextPlain']:
            bill.extras['plain_text'] = text['MatterTextPlain'].replace(
                u'\u0000', '')
        if text['MatterTextRtf']:
            bill.extras['rtf_text'] = text['MatterTextRtf'].replace(
                u'\u0000', '')

    return bill
def scrape_bill(self, bill_id):
    old = self.api('bills/' + bill_id + '?')

    # not needed
    old.pop('id')
    old.pop('state')
    old.pop('level', None)
    old.pop('country', None)
    old.pop('created_at')
    old.pop('updated_at')
    old.pop('action_dates')
    old.pop('+bill_type', None)
    old.pop('+subject', None)
    old.pop('+scraped_subjects', None)
    old.pop('subjects', [])

    classification = old.pop('type')

    # ca weirdness
    if 'fiscal committee' in classification:
        classification.remove('fiscal committee')
    if 'urgency' in classification:
        classification.remove('urgency')
    if 'local program' in classification:
        classification.remove('local program')
    if 'tax levy' in classification:
        classification.remove('tax levy')

    if classification[0] in ['miscellaneous', 'jres', 'cres']:
        return

    if classification == ['memorial resolution'] and self.state == 'ar':
        classification = ['memorial']
    if (classification == ['concurrent memorial resolution']
            and self.state == 'ar'):
        classification = ['concurrent memorial']
    if classification == ['joint session resolution'] and self.state == 'il':
        classification = ['joint resolution']
    if classification == ['legislative resolution'] and self.state == 'ny':
        classification = ['resolution']
    if classification == ['address'] and self.state == 'nh':
        classification = ['resolution']

    if not old['title'] and self.state == 'me':
        old['title'] = '(unknown)'

    chamber = old.pop('chamber')
    if self.state in ('ne', 'dc'):
        chamber = 'legislature'
    elif chamber in ('joint', 'conference'):
        chamber = 'legislature'

    new = Bill(old.pop('bill_id'), old.pop('session'), old.pop('title'),
               chamber=chamber, classification=classification)

    abstract = old.pop('summary', None)
    if abstract:
        new.add_abstract(abstract, note='')

    for title in old.pop('alternate_titles'):
        new.add_title(title)

    for doc in old.pop('documents'):
        new.add_document_link(doc['name'], doc['url'],
                              on_duplicate='ignore')

    for doc in old.pop('versions'):
        new.add_version_link(doc['name'], doc['url'],
                             media_type=doc.pop('mimetype', ''))

    for subj in old.pop('scraped_subjects', []):
        if subj:
            new.add_subject(subj)

    for spon in old.pop('sponsors'):
        if spon.get('committee_id') is not None:
            entity_type = 'organization'
        elif spon.get('leg_id') is not None:
            entity_type = 'person'
        else:
            entity_type = ''
        new.add_sponsorship(spon['name'], spon['type'], entity_type,
                            spon['type'] == 'primary')

    for act in old.pop('actions'):
        actor = act['actor']
        if actor.lower() in ('governor', 'mayor', 'secretary of state'):
            actor = 'executive'
        elif actor.lower() == 'house' or (
                actor.lower().startswith('lower (') and self.state == 'ca'):
            actor = 'lower'
        elif actor.lower() in ('senate', 'upper') or (
                actor.lower().startswith('upper (') and self.state == 'ca'):
            actor = 'upper'
        elif actor in ('joint', 'other', 'Data Systems', 'Speaker', 'clerk',
                       'Office of the Legislative Fiscal Analyst',
                       'Became Law w', 'conference') or (
                actor.lower().startswith('legislature (')
                and self.state == 'ca'):
            actor = 'legislature'

        if actor in ('committee', 'sponsor') and self.state == 'pr':
            actor = 'legislature'

        # nebraska & DC
        if actor in ('upper', 'council') and self.state in ('ne', 'dc'):
            actor = 'legislature'

        if act['action']:
            newact = new.add_action(
                act['action'], act['date'][:10], chamber=actor,
                classification=[action_types[c] for c in act['type']
                                if c != 'other'])
            # renamed from `re` to avoid shadowing the re module
            for entity in act.get('related_entities', []):
                if entity['type'] == 'committee':
                    entity['type'] = 'organization'
                elif entity['type'] == 'legislator':
                    entity['type'] = 'person'
                newact.add_related_entity(entity['name'], entity['type'])

    for comp in old.pop('companions', []):
        # only these states are known to carry companion bills; guarding
        # here keeps rtype from being unbound for other states
        if self.state in ('nj', 'ny', 'mn'):
            rtype = 'companion'
            new.add_related_bill(comp['bill_id'], comp['session'], rtype)

    for abid in (old.pop('alternate_bill_ids', []) +
                 old.pop('+alternate_bill_ids', [])):
        new.add_identifier(abid)

    # generic OpenStates stuff
    for id in old.pop('all_ids'):
        new.add_identifier(id, scheme='openstates')

    for source in old.pop('sources'):
        source.pop('retrieved', None)
        new.add_source(**source)

    ext_title = old.pop('+extended_title', None)
    if ext_title:
        new.add_title(ext_title, note='Extended Title')
    official_title = old.pop('+official_title', None)
    if official_title:
        new.add_title(official_title, note='Official Title')

    to_extras = ['+status', '+final_disposition', '+volume_chapter',
                 '+ld_number', '+referral', '+companion', '+description',
                 '+fiscal_note_probable:', '+preintroduction_required:',
                 '+drafter', '+category:', '+chapter', '+requester',
                 '+transmittal_date:', '+by_request_of',
                 '+bill_draft_number:', '+bill_lr', '+bill_url', '+rcs_num',
                 '+fiscal_note', '+impact_clause', '+fiscal_notes',
                 '+short_title', '+type_', '+conference_committee',
                 'conference_committee', '+companion_bill_ids',
                 '+additional_information']
    for k in to_extras:
        v = old.pop(k, None)
        if v:
            new.extras[k.replace('+', '')] = v

    # votes
    vote_no = 1
    for vote in old.pop('votes'):
        vote.pop('id')
        vote.pop('state')
        vote.pop('bill_id')
        vote.pop('bill_chamber', None)
        vote.pop('+state', None)
        vote.pop('+country', None)
        vote.pop('+level', None)
        vote.pop('+vacant', None)
        vote.pop('+not_voting', None)
        vote.pop('+amended', None)
        vote.pop('+excused', None)
        vote.pop('+NV', None)
        vote.pop('+AB', None)
        vote.pop('+P', None)
        vote.pop('+V', None)
        vote.pop('+E', None)
        vote.pop('+EXC', None)
        vote.pop('+EMER', None)
        vote.pop('+present', None)
        vote.pop('+absent', None)
        vote.pop('+seconded', None)
        vote.pop('+moved', None)
        vote.pop('+vote_type', None)
        vote.pop('+actual_vote', None)
        vote.pop('+skip_votes', None)
        vote.pop('vote_id')
        vote.pop('+bill_chamber', None)
        vote.pop('+session', None)
        vote.pop('+bill_id', None)
        vote.pop('+bill_session', None)
        vote.pop('committee', None)
        vote.pop('committee_id', None)

        vtype = vote.pop('type', 'passage')
        if vtype == 'veto_override':
            vtype = ['veto-override']
        elif vtype == 'amendment':
            vtype = ['amendment-passage']
        elif vtype == 'other':
            vtype = ''
        else:
            vtype = ['bill-passage']

        # most states need identifiers for uniqueness, just do it everywhere
        identifier = vote['date'] + '-' + str(vote_no)
        vote_no += 1

        chamber = vote.pop('chamber')
        if chamber == 'upper' and self.state in ('ne', 'dc'):
            chamber = 'legislature'
        elif chamber == 'joint':
            chamber = 'legislature'

        newvote = VoteEvent(legislative_session=vote.pop('session'),
                            motion_text=vote.pop('motion'),
                            result='pass' if vote.pop('passed') else 'fail',
                            chamber=chamber,
                            start_date=vote.pop('date'),
                            classification=vtype,
                            bill=new,
                            identifier=identifier)
        for vt in ('yes', 'no', 'other'):
            newvote.set_count(vt, vote.pop(vt + '_count'))
            for name in vote.pop(vt + '_votes'):
                newvote.vote(vt, name['name'])

        for source in vote.pop('sources'):
            source.pop('retrieved', None)
            newvote.add_source(**source)

        if not newvote.sources:
            newvote.sources = new.sources

        to_extras = ['+record', '+method', 'method', '+filename', 'record',
                     '+action', '+location', '+rcs_num', '+type_',
                     '+threshold', '+other_vote_detail', '+voice_vote']
        for k in to_extras:
            v = vote.pop(k, None)
            if v:
                newvote.extras[k.replace('+', '')] = v

        assert not vote, vote.keys()

        yield newvote

    assert not old, old.keys()

    yield new
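# --- Example (not part of the scraper) -----------------------------------
# The legacy-to-pupa vote-type mapping above, restated as a standalone
# helper to make the default branch explicit: anything that is not a veto
# override, an amendment vote, or 'other' is treated as a passage vote.

def map_legacy_vote_type(vtype):
    return {'veto_override': ['veto-override'],
            'amendment': ['amendment-passage'],
            'other': ''}.get(vtype, ['bill-passage'])


assert map_legacy_vote_type('passage') == ['bill-passage']
assert map_legacy_vote_type('other') == ''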
def scrape(self): for leg_summary in self.legislation(created_after=datetime.datetime(2014, 1, 1)) : leg_type = BILL_TYPES[leg_summary['Type']] bill = Bill(identifier=leg_summary['File\xa0#'], title=leg_summary['Title'], legislative_session=None, classification=leg_type, from_organization={"name":"New York City Council"}) bill.add_source(leg_summary['url']) leg_details = self.legDetails(leg_summary['url']) history = self.history(leg_summary['url']) bill.add_title(leg_details['Name'], note='created by administrative staff') if 'Summary' in leg_details : bill.add_abstract(leg_details['Summary'], note='') if leg_details['Law number'] : bill.add_identifier(leg_details['Law number'], note='law number') for sponsorship in self._sponsors(leg_details.get('Sponsors', [])) : sponsor, sponsorship_type, primary = sponsorship bill.add_sponsorship(sponsor, sponsorship_type, 'person', primary, entity_id = make_pseudo_id(name=sponsor)) for attachment in leg_details.get('Attachments', []) : bill.add_document_link(attachment['label'], attachment['url'], media_type="application/pdf") history = list(history) if history : earliest_action = min(self.toTime(action['Date']) for action in history) bill.legislative_session = self.sessions(earliest_action) else : bill.legislative_session = str(self.SESSION_STARTS[0]) for action in history : action_description = action['Action'] if not action_description : continue action_class = ACTION_CLASSIFICATION[action_description] action_date = self.toDate(action['Date']) responsible_org = action['Action\xa0By'] if responsible_org == 'City Council' : responsible_org = 'New York City Council' elif responsible_org == 'Administration' : responsible_org = 'Mayor' if responsible_org == 'Town Hall Meeting' : continue else : act = bill.add_action(action_description, action_date, organization={'name': responsible_org}, classification=action_class) if 'url' in action['Action\xa0Details'] : action_detail_url = action['Action\xa0Details']['url'] if action_class == 'committee-referral' : action_details = self.actionDetails(action_detail_url) referred_committee = action_details['Action text'].rsplit(' to the ', 1)[-1] act.add_related_entity(referred_committee, 'organization', entity_id = make_pseudo_id(name=referred_committee)) result, votes = self.extractVotes(action_detail_url) if votes : action_vote = VoteEvent(legislative_session=bill.legislative_session, motion_text=action_description, organization={'name': responsible_org}, classification=action_class, start_date=action_date, result=result, bill=bill) action_vote.add_source(action_detail_url) for option, voter in votes : action_vote.vote(option, voter) yield action_vote text = self.text(leg_summary['url']) if text : bill.extras = {'local_classification' : leg_summary['Type'], 'full_text' : text} else : bill.extras = {'local_classification' : leg_summary['Type']} yield bill
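# add_sponsorship above resolves sponsors lazily through a pseudo id. A
# hedged sketch of the idea; the real helper ships with pupa and its exact
# serialization may differ from this stand-in.
import json

def make_pseudo_id(**kwargs):
    # a pseudo id is a '~' followed by a JSON blob of matching criteria,
    # resolved to a real person/organization at import time
    return '~' + json.dumps(kwargs, sort_keys=True)


assert make_pseudo_id(name='Margaret Chin') == '~{"name": "Margaret Chin"}'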
def parse_bill(self, chamber, session, bill_id, url):
    try:
        page = self.lxmlize(url)
    except scrapelib.HTTPError as e:
        self.logger.warning(e)
        return

    last_action = self.parse_bill_field(page, "Last Action").xpath("text()")[0]
    if "WITHDRAWN" in last_action.upper():
        self.info("{} Withdrawn, skipping".format(bill_id))
        return

    # check for a missing documents field *before* dereferencing it
    version = self.parse_bill_field(page, "Bill Documents")
    if version is None or not version.xpath("a[1]/@href"):
        # Bill withdrawn
        self.logger.warning("Bill withdrawn.")
        return

    source_url = version.xpath("a[1]/@href")[0]
    version_title = version.xpath("a[1]/text()")[0].strip()

    if source_url.endswith(".doc"):
        mimetype = "application/msword"
    elif source_url.endswith(".pdf"):
        mimetype = "application/pdf"
    else:
        mimetype = None  # unexpected extension; don't leave mimetype unbound

    title = self.parse_bill_field(page, "Title").text_content()

    # actions = self.get_nodes(
    #     page,
    #     '//div[@class="StandardText leftDivMargin"]/'
    #     'div[@class="StandardText"][last()]//text()[normalize-space()]')

    if "CR" in bill_id:
        bill_type = "concurrent resolution"
    elif "JR" in bill_id:
        bill_type = "joint resolution"
    elif "R" in bill_id:
        bill_type = "resolution"
    else:
        bill_type = "bill"

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=chamber,
        title=title,
        classification=bill_type,
    )
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    bill.add_version_link(version_title, source_url, media_type=mimetype)

    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)

    # LM is "Locally Mandated fiscal impact"
    fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
    for fiscal_note in fiscal_notes:
        source_url = fiscal_note.attrib["href"]
        if source_url.endswith(".doc"):
            mimetype = "application/msword"
        elif source_url.endswith(".pdf"):
            mimetype = "application/pdf"
        bill.add_document_link("Fiscal Note", source_url, media_type=mimetype)

    for link in page.xpath("//td/span/a[contains(@href, 'Legislator-Profile')]"):
        bill.add_sponsorship(
            link.text.strip(),
            classification="primary",
            entity_type="person",
            primary=True,
        )

    bdr_no = self.parse_bill_field(page, "Bill Request Number")
    if bdr_no.xpath("text()"):
        bdr = bdr_no.xpath("text()")[0].strip()
        bill.extras["BDR"] = bdr

    yield bill
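# The .doc/.pdf branches above (and in the fiscal-note loop) cover only two
# extensions. A sketch of a stdlib alternative; guess_media_type is a
# hypothetical helper, not part of this scraper.
import mimetypes

def guess_media_type(url, default="application/octet-stream"):
    media_type, _ = mimetypes.guess_type(url)
    return media_type or default


assert guess_media_type("/record/HB1.pdf") == "application/pdf"
assert guess_media_type("/record/HB1.doc") == "application/msword"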
def scrape_assem_bills(self, chamber, insert, session, year): doc_type = {1: 'bill', 3: 'resolution', 5: 'concurrent resolution', 6: 'joint resolution', 9: 'petition'} for docnum, bill_type in doc_type.items(): parentpage_url = 'http://www.leg.state.nv.us/Session/%s/' \ 'Reports/HistListBills.cfm?DoctypeID=%s' % (insert, docnum) links = self.scrape_links(parentpage_url) count = 0 for link in links: count = count + 1 page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link) page = self.get(page_path).text page = page.replace(u"\xa0", " ") root = lxml.html.fromstring(page) root.make_links_absolute("http://www.leg.state.nv.us/") bill_id = root.xpath('string(/html/body/div[@id="content"]' '/table[1]/tr[1]/td[1]/font)') title = self.get_node( root, '//div[@id="content"]/table/tr[preceding-sibling::tr/td/' 'b[contains(text(), "By:")]]/td/em/text()') bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type) bill.subject = list(set(self.subject_mapping[bill_id])) billtext = root.xpath("//b[text()='Bill Text']")[0].getparent().getnext() text_urls = billtext.xpath("./a") for text_url in text_urls: version_name = text_url.text.strip() version_url = text_url.attrib['href'] bill.add_version_link(note=version_name, url=version_url, media_type='application/pdf') primary, secondary = self.scrape_sponsors(page) for leg in primary: bill.add_sponsorship(classification='primary', name=leg, entity_type='person', primary=True) for leg in secondary: bill.add_sponsorship(classification='cosponsor', name=leg, entity_type='person', primary=False) minutes_count = 2 for mr in root.xpath('//table[4]/tr/td[3]/a'): minutes = mr.xpath("string(@href)") minutes_url = "http://www.leg.state.nv.us" + minutes minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count minutes_date = mr.xpath(minutes_date_path).split() minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Minutes" bill.add_document_link(note=minutes_date, url=minutes_url) minutes_count += 1 self.scrape_actions(root, bill, "lower") yield from self.scrape_votes(page, page_path, bill, insert, year) bill.add_source(page_path) yield bill
def scrape_senate_bills(self, chamber, insert, session, year): doc_type = {2: 'bill', 4: 'resolution', 7: 'concurrent resolution', 8: 'joint resolution'} for docnum, bill_type in doc_type.items(): parentpage_url = 'http://www.leg.state.nv.us/Session/%s/Reports/' \ 'HistListBills.cfm?DoctypeID=%s' % (insert, docnum) links = self.scrape_links(parentpage_url) count = 0 for link in links: count += 1 page_path = 'http://www.leg.state.nv.us/Session/%s/Reports/%s' % (insert, link) page = self.get(page_path).text page = page.replace(u"\xa0", " ") root = lxml.html.fromstring(page) bill_id = root.xpath('string(/html/body/div[@id="content"]' + '/table[1]/tr[1]/td[1]/font)') title = self.get_node( root, '//div[@id="content"]/table/tr[preceding-sibling::tr/td/' 'b[contains(text(), "By:")]]/td/em/text()') bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type ) bill.subject = list(set(self.subject_mapping[bill_id])) for table in root.xpath('//div[@id="content"]/table'): if 'Bill Text' in table.text_content(): bill_text = table.xpath("string(tr/td[2]/a/@href)") text_url = "http://www.leg.state.nv.us" + bill_text bill.add_version_link(note="Bill Text", url=text_url, media_type='application/pdf') primary, secondary = self.scrape_sponsors(page) for leg in primary: bill.add_sponsorship(name=leg, classification='primary', entity_type='person', primary=True) for leg in secondary: bill.add_sponsorship(name=leg, classification='cosponsor', entity_type='person', primary=False) minutes_count = 2 for mr in root.xpath('//table[4]/tr/td[3]/a'): minutes = mr.xpath("string(@href)") minutes_url = "http://www.leg.state.nv.us" + minutes minutes_date_path = "string(//table[4]/tr[%s]/td[2])" % minutes_count minutes_date = mr.xpath(minutes_date_path).split() minutes_date = minutes_date[0] + minutes_date[1] + minutes_date[2] + " Agenda" # bill.add_document(minutes_date, minutes_url) bill.add_document_link(note=minutes_date, url=minutes_url) minutes_count = minutes_count + 1 self.scrape_actions(root, bill, "upper") yield from self.scrape_votes(page, page_path, bill, insert, year) bill.add_source(page_path) yield bill
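# Both Nevada scrapers walk //table[4] with a manual row counter starting at
# 2 to pair each minutes link with the date cell in the same row. The same
# pairing via enumerate, as a sketch under the same table-layout assumption;
# iter_minutes is a hypothetical helper, not part of either scraper.
def iter_minutes(root):
    # root is the parsed lxml document for a bill history page
    for row, link in enumerate(root.xpath('//table[4]/tr/td[3]/a'), start=2):
        date_text = root.xpath('string(//table[4]/tr[%s]/td[2])' % row)
        yield date_text.strip(), link.get('href')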
def scrape(self, window=28, matter_ids=None): '''By default, scrape board reports updated in the last 28 days. Optionally specify a larger or smaller window of time from which to scrape updates, or specific matters to scrape. Note that passing a value for :matter_ids supercedes the value of :window, such that the given matters will be scraped regardless of when they were updated. Optional parameters :window (numeric) - Amount of time for which to scrape updates, e.g. a window of 7 will scrape legislation updated in the last week. Pass a window of 0 to scrape all legislation. :matter_ids (str) - Comma-separated list of matter IDs to scrape ''' if matter_ids: matters = [ self.matter(matter_id) for matter_id in matter_ids.split(',') ] matters = filter( None, matters) # Skip matters that are not yet in Legistar elif float(window): # Support for partial days, i.e., window=0.15 n_days_ago = datetime.datetime.utcnow() - datetime.timedelta( float(window)) matters = self.matters(n_days_ago) else: # Scrape all matters, including those without a last-modified date matters = self.matters() n_days_ago = datetime.datetime.utcnow() - datetime.timedelta( float(window)) for matter in matters: # Skip this bill, until Metro cleans up duplicate in Legistar API if matter['MatterFile'] == '2017-0447': continue matter_id = matter['MatterId'] date = matter['MatterIntroDate'] title = matter['MatterTitle'] identifier = matter['MatterFile'] if not all((date, title, identifier)): continue # Do not scrape private bills introduced before this timestamp. if self._is_restricted(matter) and ( date < self.START_DATE_PRIVATE_SCRAPE): continue bill_session = self.session(self.toTime(date)) bill_type = BILL_TYPES[matter['MatterTypeName']] if identifier.startswith('S'): alternate_identifiers = [identifier] identifier = identifier[1:] else: alternate_identifiers = [] bill = Bill(identifier=identifier, legislative_session=bill_session, title=title, classification=bill_type, from_organization={"name": "Board of Directors"}) # The Metro scraper scrapes private bills. # However, we do not want to capture significant data about private bills, # other than the value of the helper function `_is_restricted` and a last modified timestamp. # We yield private bills early, wipe data from previously imported once-public bills, # and include only data *required* by the pupa schema. # https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py bill.extras = {'restrict_view': self._is_restricted(matter)} # Add API source early. # Private bills should have this url for debugging. 
legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id) bill.add_source(legistar_api, note='api') if self._is_restricted(matter): # required fields bill.title = 'Restricted View' # wipe old data bill.extras['plain_text'] = '' bill.extras['rtf_text'] = '' bill.sponsorships = [] bill.related_bills = [] bill.versions = [] bill.documents = [] bill.actions = [] yield bill continue legistar_web = matter['legistar_url'] bill.add_source(legistar_web, note='web') for identifier in alternate_identifiers: bill.add_identifier(identifier) for action, vote in self.actions(matter_id): act = bill.add_action(**action) if action['description'] == 'Referred': body_name = matter['MatterBodyName'] act.add_related_entity( body_name, 'organization', entity_id=_make_pseudo_id(name=body_name)) result, votes = vote if result: vote_event = VoteEvent( legislative_session=bill.legislative_session, motion_text=action['description'], organization=action['organization'], classification=None, start_date=action['date'], result=result, bill=bill) vote_event.add_source(legistar_web) vote_event.add_source(legistar_api + '/histories') for vote in votes: try: raw_option = vote['VoteValueName'].lower() except AttributeError: raw_option = None clean_option = self.VOTE_OPTIONS.get( raw_option, raw_option) vote_event.vote(clean_option, vote['VotePersonName'].strip()) yield vote_event for sponsorship in self.sponsorships(matter_id): bill.add_sponsorship(**sponsorship) for topic in self.topics(matter_id): bill.add_subject(topic['MatterIndexName'].strip()) for relation in self.relations(matter_id): try: # Get data (i.e., json) for the related bill. # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session). # Sometimes, the related bill does not yet exist: in this case, throw an error, and continue. related_bill = self.endpoint( '/matters/{0}', relation['MatterRelationMatterId']) except scrapelib.HTTPError: continue else: date = related_bill['MatterIntroDate'] related_bill_session = self.session(self.toTime(date)) identifier = related_bill['MatterFile'] bill.add_related_bill( identifier=identifier, legislative_session=related_bill_session, relation_type='companion') # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104 # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'. bill.add_version_link( 'Board Report', 'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report' .format(matter_id), media_type="application/pdf") for attachment in self.attachments(matter_id): if attachment['MatterAttachmentName']: bill.add_document_link( attachment['MatterAttachmentName'], attachment['MatterAttachmentHyperlink'].strip(), media_type="application/pdf") bill.extras['local_classification'] = matter['MatterTypeName'] matter_version_value = matter['MatterVersion'] text = self.text(matter_id, matter_version_value) if text: if text['MatterTextPlain']: bill.extras['plain_text'] = text['MatterTextPlain'] if text['MatterTextRtf']: bill.extras['rtf_text'] = text['MatterTextRtf'].replace( u'\u0000', '') yield bill
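# The window handling above leans on datetime.timedelta accepting floats, so
# window=0.25 means "updated in the last six hours". A standalone sketch;
# updated_since is a hypothetical helper, not part of the scraper.
import datetime

def updated_since(window_days):
    # returns the UTC cutoff timestamp for a scrape window
    return datetime.datetime.utcnow() - datetime.timedelta(float(window_days))


assert updated_since(1) < datetime.datetime.utcnow()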
def scrape_bill(self, chamber, session, bill_id, short_title=None): """ Scrapes documents, actions, vote counts and votes for bills from the 2009 session and above. """ url = BILL_URL % (session, bill_id.replace(" ", "")) bill_page = self.get(url).text html = lxml.html.fromstring(bill_page) html.make_links_absolute( "http://legislature.idaho.gov/legislation/%s/" % session) bill_tables = html.xpath('//table[contains(@class, "bill-table")]') title = bill_tables[1].text_content().strip() bill_type = get_bill_type(bill_id) bill = Bill( legislative_session=session, chamber=chamber, identifier=bill_id, title=title, classification=bill_type, ) bill.add_source(url) for subject in self._subjects[bill_id.replace(" ", "")]: bill.add_subject(subject) if short_title and title.lower() != short_title.lower(): bill.add_title(short_title, "short title") # documents doc_links = html.xpath('//div[contains(@class,"insert-page")]//a') for link in doc_links: name = link.text_content().strip() href = link.get("href") if "Engrossment" in name or "Bill Text" in name or "Amendment" in name: bill.add_version_link(note=name, url=href, media_type="application/pdf") else: bill.add_document_link(note=name, url=href, media_type="application/pdf") def _split(string): return re.split(r"\w+[,|AND]\s+", string) # sponsors range from a committee to one legislator to a group of legs sponsor_lists = bill_tables[0].text_content().split("by") if len(sponsor_lists) > 1: for sponsors in sponsor_lists[1:]: if "COMMITTEE" in sponsors.upper(): bill.add_sponsorship( name=sponsors.strip(), entity_type="organization", primary=True, classification="primary", ) else: for person in _split(sponsors): person = person.strip() if person != "": bill.add_sponsorship( classification="primary", name=person, entity_type="person", primary=True, ) actor = chamber last_date = None # if a bill has passed a chamber or been 'received from' # then the next committee passage is in the opposite chamber has_moved_chambers = False for row in bill_tables[2]: # lots of empty rows if len(row) == 1: continue _, date, action, _ = [x.text_content().strip() for x in row] if date: last_date = date else: date = last_date date = datetime.datetime.strptime(date + "/" + session[0:4], "%m/%d/%Y").strftime("%Y-%m-%d") if action.startswith("House"): actor = "lower" elif action.startswith("Senate"): actor = "upper" # votes if "AYES" in action or "NAYS" in action: yield from self.parse_vote(actor, date, row[2], session, bill_id, chamber, url) # bill.add_vote_event(vote) # some td's text is seperated by br elements if len(row[2]): action = "".join(row[2].itertext()) action = action.replace(u"\xa0", " ").strip() atype = get_action(actor, action) if atype and "passage" in atype: has_moved_chambers = True if atype and "committee-passage" in atype and has_moved_chambers: actor = _OTHER_CHAMBERS[actor] bill.add_action(action, date, chamber=actor, classification=atype) # after voice vote/roll call and some actions the bill is sent # 'to House' or 'to Senate' if "to House" in action: actor = "lower" elif "to Senate" in action: actor = "upper" yield bill
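# _OTHER_CHAMBERS above is assumed to be a simple chamber-flip table: once a
# bill has passed one chamber, the next committee passage is credited to the
# opposite one. A minimal sketch of that mapping:
_OTHER_CHAMBERS = {"lower": "upper", "upper": "lower"}

assert _OTHER_CHAMBERS["lower"] == "upper"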
def scrape_bill(self, chamber, session, bill_id, url): try: page = lxml.html.fromstring(self.get(url).text) except scrapelib.HTTPError as e: self.warning('error (%s) fetching %s, skipping' % (e, url)) return title = page.xpath( "string(//span[contains(@id, 'PlaceHolder1_txtST')])").strip() if not title: self.warning('blank bill on %s - skipping', url) return if 'JR' in bill_id: bill_type = ['joint resolution'] elif 'CR' in bill_id: bill_type = ['concurrent resolution'] elif 'R' in bill_id: bill_type = ['resolution'] else: bill_type = ['bill'] bill = Bill(bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type) bill.add_source(url) bill.subject = self.subject_map[bill_id] for link in page.xpath("//a[contains(@id, 'Auth')]"): name = link.xpath("string()").strip() if ':' in name: raise Exception(name) if 'otherAuth' in link.attrib['id']: bill.add_sponsorship(name, classification='cosponsor', entity_type='person', primary=False) else: bill.add_sponsorship(name, classification='primary', entity_type='person', primary=True) act_table = page.xpath("//table[contains(@id, 'Actions')]")[0] for tr in act_table.xpath("tr")[2:]: action = tr.xpath("string(td[1])").strip() if not action or action == 'None': continue date = tr.xpath("string(td[3])").strip() date = datetime.datetime.strptime(date, "%m/%d/%Y").date() actor = tr.xpath("string(td[4])").strip() if actor == 'H': actor = 'lower' elif actor == 'S': actor = 'upper' attrs = self.categorizer.categorize(action) related_entities = [] for item in attrs['committees']: related_entities.append({ 'type': 'committee', 'name': item }) for item in attrs['legislators']: related_entities.append({ 'type': 'legislator', 'name': item }) bill.add_action(description=action, date=date.strftime('%Y-%m-%d'), chamber=actor, classification=attrs['classification'], related_entities=related_entities) version_table = page.xpath("//table[contains(@id, 'Versions')]")[0] # Keep track of already seen versions to prevent processing duplicates. version_urls = [] for link in version_table.xpath(".//a[contains(@href, '.PDF')]"): version_url = link.attrib['href'] if version_url in version_urls: self.warning('Skipping duplicate version URL.') continue else: version_urls.append(version_url) name = link.text.strip() if re.search('COMMITTEE REPORTS|SCHEDULED CCR', version_url, re.IGNORECASE): bill.add_document_link(note=name, url=version_url, media_type='application/pdf') continue bill.add_version_link(note=name, url=version_url, media_type='application/pdf') for link in page.xpath(".//a[contains(@href, '_VOTES')]"): if 'HT_' not in link.attrib['href']: yield from self.scrape_votes(bill, self.urlescape(link.attrib['href'])) # # If the bill has no actions and no versions, it's a bogus bill on # # their website, which appears to happen occasionally. Skip. has_no_title = (bill.title == "Short Title Not Found.") if has_no_title: # If there's no title, this is an empty page. Skip! return else: # Otherwise, save the bills. yield bill
def parse_bill(self, chamber, session, bill_id, url):
    page = self.lxmlize(url)

    last_action = self.parse_bill_field(page, 'Last Action').xpath('text()')[0]
    if 'WITHDRAWN' in last_action.upper():
        self.info("{} Withdrawn, skipping".format(bill_id))
        return

    # check for a missing documents field *before* dereferencing it
    version = self.parse_bill_field(page, 'Bill Documents')
    if version is None or not version.xpath('a[1]/@href'):
        # Bill withdrawn
        self.logger.warning('Bill withdrawn.')
        return

    source_url = version.xpath('a[1]/@href')[0]
    version_title = version.xpath('a[1]/text()')[0].strip()

    if source_url.endswith('.doc'):
        mimetype = 'application/msword'
    elif source_url.endswith('.pdf'):
        mimetype = 'application/pdf'
    else:
        mimetype = None  # unexpected extension; don't leave mimetype unbound

    title = self.parse_bill_field(page, 'Title').text_content()

    # actions = self.get_nodes(
    #     page,
    #     '//div[@class="StandardText leftDivMargin"]/'
    #     'div[@class="StandardText"][last()]//text()[normalize-space()]')

    if 'CR' in bill_id:
        bill_type = 'concurrent resolution'
    elif 'JR' in bill_id:
        bill_type = 'joint resolution'
    elif 'R' in bill_id:
        bill_type = 'resolution'
    else:
        bill_type = 'bill'

    bill = Bill(bill_id, legislative_session=session, chamber=chamber,
                title=title, classification=bill_type)
    bill.subject = self._subjects[bill_id]
    bill.add_source(url)

    bill.add_version_link(version_title, source_url, media_type=mimetype)

    self.parse_actions(page, bill, chamber)
    self.parse_subjects(page, bill)

    # LM is "Locally Mandated fiscal impact"
    fiscal_notes = page.xpath('//a[contains(@href, "/LM.pdf")]')
    for fiscal_note in fiscal_notes:
        source_url = fiscal_note.attrib['href']
        if source_url.endswith('.doc'):
            mimetype = 'application/msword'
        elif source_url.endswith('.pdf'):
            mimetype = 'application/pdf'
        bill.add_document_link("Fiscal Note", source_url, media_type=mimetype)

    for link in page.xpath("//td/span/a[contains(@href, 'Legislator-Profile')]"):
        bill.add_sponsorship(link.text.strip(), classification='primary',
                             entity_type='person', primary=True)

    bdr_no = self.parse_bill_field(page, 'Bill Request Number')
    if bdr_no.xpath('text()'):
        bdr = bdr_no.xpath('text()')[0].strip()
        bill.extras["BDR"] = bdr

    yield bill
def scrape_bill(self, session, bill_url):
    page = self.get(bill_url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(bill_url)

    try:
        bill_id = page.xpath('//span[@id="lblBillNumber"]/a[1]')[0].text
    except IndexError:
        self.logger.warning("Something is wrong with bill page, skipping.")
        return
    secondary_bill_id = page.xpath('//span[@id="lblCompNumber"]/a[1]')

    # checking if there is a matching bill
    if secondary_bill_id:
        secondary_bill_id = secondary_bill_id[0].text
        # swap ids if * is in secondary_bill_id
        if "*" in secondary_bill_id:
            bill_id, secondary_bill_id = secondary_bill_id, bill_id
        secondary_bill_id = secondary_bill_id.strip()
        secondary_bill_id = secondary_bill_id.replace("  ", " ")

    bill_id = bill_id.replace("*", "").replace("  ", " ").strip()

    if "B" in bill_id:
        bill_type = "bill"
    elif "JR" in bill_id:
        bill_type = "joint resolution"
    elif "R" in bill_id:
        bill_type = "resolution"
    else:
        bill_type = "bill"  # default rather than leaving bill_type unbound

    primary_chamber = "lower" if "H" in bill_id else "upper"
    # secondary_chamber = 'upper' if primary_chamber == 'lower' else 'lower'

    title = page.xpath("//span[@id='lblAbstract']")[0].text
    if title is None:
        msg = "%s detail page was missing title info."
        self.logger.warning(msg % bill_id)
        return

    # bill subject (assumes the title carries a "subjects - description" form)
    subject_pos = title.find("-")
    subjects = [s.strip() for s in title[: subject_pos - 1].split(",")]
    subjects = filter(None, subjects)

    bill = Bill(
        bill_id,
        legislative_session=session,
        chamber=primary_chamber,
        title=title,
        classification=bill_type,
    )
    for subject in subjects:
        bill.add_subject(subject)

    if secondary_bill_id:
        bill.add_identifier(secondary_bill_id)

    if page.xpath('//span[@id="lblCompNumber"]/a'):
        companion_id = (
            page.xpath('//span[@id="lblCompNumber"]/a')[0].text_content().strip()
        )
        bill.add_related_bill(
            identifier=companion_id,
            legislative_session=session,
            relation_type="companion",
        )

    bill.add_source(bill_url)

    # Primary Sponsor
    sponsor = (
        page.xpath("//span[@id='lblBillPrimeSponsor']")[0]
        .text_content()
        .split("by")[-1]
    )
    sponsor = sponsor.replace("*", "").strip()
    if sponsor:
        bill.add_sponsorship(
            sponsor, classification="primary", entity_type="person", primary=True
        )

    # bill text
    btext = page.xpath("//span[@id='lblBillNumber']/a")[0]
    bill.add_version_link(
        "Current Version", btext.get("href"), media_type="application/pdf"
    )

    # documents
    summary = page.xpath('//a[contains(@href, "BillSummaryArchive")]')
    if summary:
        bill.add_document_link("Summary", summary[0].get("href"))
    fiscal = page.xpath('//span[@id="lblFiscalNote"]//a')
    if fiscal:
        bill.add_document_link("Fiscal Note", fiscal[0].get("href"))
    amendments = page.xpath('//a[contains(@href, "/Amend/")]')
    for amendment in amendments:
        bill.add_document_link("Amendment " + amendment.text, amendment.get("href"))
    # amendment notes in image with alt text describing doc inside <a>
    amend_fns = page.xpath('//img[contains(@alt, "Fiscal Memo")]')
    for afn in amend_fns:
        bill.add_document_link(
            afn.get("alt"), afn.getparent().get("href"), on_duplicate="ignore"
        )

    # actions
    atable = page.xpath("//table[@id='gvBillActionHistory']")[0]
    actions_from_table(bill, atable)

    # if there is a matching bill
    if secondary_bill_id:
        # secondary sponsor
        secondary_sponsor = (
            page.xpath("//span[@id='lblCompPrimeSponsor']")[0]
            .text_content()
            .split("by")[-1]
        )
        secondary_sponsor = (
            secondary_sponsor.replace("*", "").replace(")", "").strip()
        )
        # Skip blank-name sponsors.
        if secondary_sponsor:
            bill.add_sponsorship(
                secondary_sponsor,
                classification="primary",
                entity_type="person",
                primary=True,
            )

        # secondary actions
        cotable = page.xpath("//table[@id='gvCoActionHistory']")[0]
        actions_from_table(bill, cotable)

    # votes
    yield from self.scrape_vote_events(bill, page, bill_url)

    bill.actions.sort(key=lambda a: a["date"])

    yield bill
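# The final sort above assumes each action dict stores an ISO "date" string,
# so plain lexicographic order is chronological order. For example:
actions = [{"date": "2019-02-01"}, {"date": "2019-01-15"}]
actions.sort(key=lambda a: a["date"])
assert [a["date"] for a in actions] == ["2019-01-15", "2019-02-01"]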
def test_full_bill(): create_jurisdiction() sp = ScrapePerson('Adam Smith') org = ScrapeOrganization(name='House', classification='lower') com = ScrapeOrganization(name='Arbitrary Committee', classification='committee', parent_id=org._id) oldbill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act', classification='tax bill', from_organization=org._id) bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', classification='tax bill', from_organization=org._id) bill.subject = ['taxes', 'axes'] bill.add_identifier('SB 9') bill.add_title('Tack & Axe Tax Act') bill.add_action('introduced in house', '1900-04-01', chamber='lower') act = bill.add_action('sent to arbitrary committee', '1900-04-04', chamber='lower') act.add_related_entity('arbitrary committee', 'organization', com._id) bill.add_related_bill("HB 99", legislative_session="1899", relation_type="prior-session") bill.add_sponsorship('Adam Smith', classification='extra sponsor', entity_type='person', primary=False, entity_id=sp._id) bill.add_sponsorship('Jane Smith', classification='lead sponsor', entity_type='person', primary=True) bill.add_abstract('This is an act about axes and taxes and tacks.', note="official", date='1969-10-20') bill.add_document_link('Fiscal Note', 'http://example.com/fn.pdf', media_type='application/pdf') bill.add_document_link('Fiscal Note', 'http://example.com/fn.html', media_type='text/html') bill.add_version_link('Fiscal Note', 'http://example.com/v/1', media_type='text/html') bill.add_source('http://example.com/source') # import bill oi = OrganizationImporter('jid') oi.import_data([org.as_dict(), com.as_dict()]) pi = PersonImporter('jid') pi.import_data([sp.as_dict()]) BillImporter('jid', oi, pi).import_data([oldbill.as_dict(), bill.as_dict()]) # get bill from db and assert it imported correctly b = Bill.objects.get(identifier='HB 1') assert b.from_organization.classification == 'lower' assert b.identifier == bill.identifier assert b.title == bill.title assert b.classification == bill.classification assert b.subject == ['taxes', 'axes'] assert b.abstracts.get().note == 'official' assert b.abstracts.get().date == '1969-10-20' # other_title, other_identifier added assert b.other_titles.get().title == 'Tack & Axe Tax Act' assert b.other_identifiers.get().identifier == 'SB 9' # actions actions = list(b.actions.all()) assert len(actions) == 2 # ensure order was preserved (if this breaks it'll be intermittent) assert actions[0].organization == Organization.objects.get( classification='lower') assert actions[0].description == "introduced in house" assert actions[1].description == "sent to arbitrary committee" assert (actions[1].related_entities.get().organization == Organization.objects.get(classification='committee')) # related_bills were added rb = b.related_bills.get() assert rb.identifier == 'HB 99' # and bill got resolved assert rb.related_bill.identifier == 'HB 99' # sponsors added, linked & unlinked sponsorships = b.sponsorships.all() assert len(sponsorships) == 2 person = Person.objects.get(name='Adam Smith') for ss in sponsorships: if ss.primary: assert ss.person is None assert ss.organization is None else: assert ss.person == person # versions & documents with their links versions = b.versions.all() assert len(versions) == 1 assert versions[0].links.count() == 1 documents = b.documents.all() assert len(documents) == 1 assert documents[0].links.count() == 2 # sources assert b.sources.count() == 1
def scrape_bill(self, bill_num, session):
    chamber_map = {'House': 'lower', 'Senate': 'upper', 'LSO': 'executive'}
    # Sample with all keys: https://gist.github.com/showerst/d6cd03eff3e8b12ab01dbb219876db45
    bill_json_url = 'http://wyoleg.gov/LsoService/api/BillInformation/{}/' \
                    '{}?calendarDate='.format(session, bill_num)
    response = self.get(bill_json_url)
    bill_json = json.loads(response.content.decode('utf-8'))

    # bill ids start with 'H' or 'S'; the first character must be compared
    # explicitly (a bare bill_json['bill'][0] is always truthy)
    chamber = 'lower' if bill_json['bill'][0] == 'H' else 'upper'

    bill = Bill(
        identifier=bill_json['bill'],
        legislative_session=session,
        title=bill_json['catchTitle'],
        chamber=chamber,
        classification="bill",
    )

    bill.add_title(bill_json['billTitle'])

    source_url = 'http://lso.wyoleg.gov/Legislation/{}/{}'.format(
        session, bill_json['bill'])
    bill.add_source(source_url)

    for action_json in bill_json['billActions']:
        utc_action_date = self.parse_local_date(action_json['statusDate'])

        actor = None
        if action_json['location'] and action_json['location'] in chamber_map:
            actor = chamber_map[action_json['location']]

        action = bill.add_action(
            chamber=actor,
            description=action_json['statusMessage'],
            date=utc_action_date,
            classification=categorize_action(action_json['statusMessage']),
        )

        action.extras = {'billInformationID': action_json['billInformationID']}

    if bill_json['introduced']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['introduced'])
        bill.add_version_link(
            note="Introduced",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['enrolledAct']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['enrolledAct'])
        bill.add_version_link(
            note="Enrolled",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['fiscalNote']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['fiscalNote'])
        bill.add_document_link(
            note="Fiscal Note",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['digest']:
        url = 'http://wyoleg.gov/{}'.format(bill_json['digest'])
        bill.add_document_link(
            note="Bill Digest",
            url=url,
            media_type="application/pdf"  # optional but useful!
        )

    if bill_json['vetoes']:
        for veto in bill_json['vetoes']:
            url = 'http://wyoleg.gov/{}'.format(veto['vetoLinkPath'])
            bill.add_version_link(
                note=veto['vetoLinkText'],
                url=url,
                media_type="application/pdf"  # optional but useful!
            )

    for amendment in bill_json['amendments']:
        # http://wyoleg.gov/2018/Amends/SF0050H2001.pdf
        url = 'http://wyoleg.gov/{}/Amends/{}.pdf'.format(
            session, amendment['amendmentNumber'])

        if amendment['sponsor'] and amendment['status']:
            title = 'Amendment {} ({}) - {} ({})'.format(
                amendment['amendmentNumber'],
                amendment['order'],
                amendment['sponsor'],
                amendment['status'],
            )
        else:
            title = 'Amendment {} ({})'.format(
                amendment['amendmentNumber'],
                amendment['order'],
            )

        # add versions of the bill text
        version = bill.add_version_link(
            note=title,
            url=url,
            media_type="application/pdf",
        )
        version['extras'] = {
            'amendmentNumber': amendment['amendmentNumber'],
            'sponsor': amendment['sponsor'],
        }

    for sponsor in bill_json['sponsors']:
        status = 'primary' if sponsor['primarySponsor'] else 'cosponsor'
        sponsor_type = 'person' if sponsor['sponsorTitle'] else 'organization'
        bill.add_sponsorship(
            name=sponsor['name'],
            classification=status,
            entity_type=sponsor_type,
            primary=sponsor['primarySponsor'],
        )

    if bill_json['summary']:
        bill.add_abstract(
            note="summary",
            abstract=bill_json['summary'],
        )

    if bill_json['enrolledNumber']:
        bill.extras['wy_enrolled_number'] = bill_json['enrolledNumber']

    if bill_json['chapter']:
        bill.extras['chapter'] = bill_json['chapter']

    if bill_json['effectiveDate']:
        eff = datetime.datetime.strptime(bill_json['effectiveDate'], '%m/%d/%Y')
        bill.extras['effective_date'] = eff.strftime('%Y-%m-%d')

    bill.extras['wy_bill_id'] = bill_json['id']

    for vote_json in bill_json['rollCalls']:
        yield from self.scrape_vote(bill, vote_json, session)

    yield bill
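# The effectiveDate handling above, reduced to a sketch: Wyoming publishes
# m/d/Y dates and the scraper stores them as ISO strings in bill.extras.
# to_iso is a hypothetical helper mirroring the strptime/strftime round-trip.
import datetime

def to_iso(us_date):
    return datetime.datetime.strptime(us_date, "%m/%d/%Y").strftime("%Y-%m-%d")


assert to_iso("07/01/2018") == "2018-07-01"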
def scrape_events_range(self, start_date, end_date): def daterange(start_date, end_date): number_of_days = int((end_date - start_date).days) for n in range(number_of_days): yield start_date + dt.timedelta(n) for date in daterange(start_date, end_date): calendar_day_url = CALENDAR_DAY_TEMPLATE.format(date.year, date.month - 1, date.day) events = self.extract_events_by_url(calendar_day_url) for event in events: tz = pytz.timezone("America/Toronto") time = dt.datetime.strptime(event["time"], "%I:%M %p") start = tz.localize(date.replace(hour=time.hour, minute=time.minute, second=0, microsecond=0)) org_name = event["meeting"] e = Event( name=org_name, start_time=start, timezone=tz.zone, location_name=event["location"], status=STATUS_DICT.get(event["meeting_status"]), ) e.extras = {"meeting_number": event["no"], "tmmis_meeting_id": event["meeting_id"]} e.add_source(calendar_day_url) e.add_participant(name=org_name, type="organization") def is_agenda_available(event): return event["publishing_status"] in ["Agenda Published", "Minutes Published"] def is_council(event): return True if event["meeting"] == self.jurisdiction.name else False if is_agenda_available(event): agenda_url_template = ( AGENDA_FULL_COUNCIL_TEMPLATE if is_council(event) else AGENDA_FULL_STANDARD_TEMPLATE ) agenda_url = agenda_url_template.format(event["meeting_id"]) full_identifiers = list(self.full_identifiers(event["meeting_id"], is_council(event))) event_map_url_template = ( "http://app.toronto.ca/tmmis/getAddressList.do?function=getMeetingAddressList&meetingId={}" ) event_map_url = event_map_url_template.format(event["meeting_id"]) addresses_d = self.addressesByAgendaId(event_map_url) e.add_source(agenda_url) agenda_items = self.agenda_from_url(agenda_url) for i, item in enumerate(agenda_items): a = e.add_agenda_item(item["title"]) a.add_classification(item["type"].lower()) a["order"] = str(i) def normalize_wards(raw): if not raw: raw = "All" if raw == "All": return raw.lower() else: return raw.split(", ") wards = normalize_wards(item["wards"]) identifier_regex = re.compile(r"^[0-9]{4}\.([A-Z]{2}[0-9]+\.[0-9]+)$") [full_identifier] = [ id for id in full_identifiers if identifier_regex.match(id).group(1) == item["identifier"] ] a.add_bill(full_identifier) if full_identifier not in self.seen_agenda_items: b = Bill( # TODO: Fix this hardcode legislative_session="2014-2018", identifier=full_identifier, title=item["title"], from_organization={"name": self.jurisdiction.name}, ) b.add_source(agenda_url) b.add_document_link( note="canonical", media_type="text/html", url=AGENDA_ITEM_TEMPLATE.format(full_identifier), ) b.extras["wards"] = wards addresses = addresses_d.get(full_identifier) if addresses: b.extras["locations"] = [] for address in addresses: location = {"address": {"full_address": address}} b.extras["locations"].append(location) self.seen_agenda_items.append(full_identifier) yield b yield e
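# daterange above is exclusive of end_date, like range(). The same generator
# as a standalone sketch:
import datetime as dt

def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + dt.timedelta(n)


days = list(daterange(dt.datetime(2018, 1, 1), dt.datetime(2018, 1, 4)))
assert len(days) == 3  # 2018-01-04 itself is never scraped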
def scrape_bill(self, chamber, session, bill_id, url): try: page = lxml.html.fromstring(self.get(url).text) except scrapelib.HTTPError as e: self.warning("error (%s) fetching %s, skipping" % (e, url)) return title = page.xpath( "string(//span[contains(@id, 'PlaceHolder1_txtST')])" ).strip() if not title: self.warning("blank bill on %s - skipping", url) return if "JR" in bill_id: bill_type = ["joint resolution"] elif "CR" in bill_id: bill_type = ["concurrent resolution"] elif "R" in bill_id: bill_type = ["resolution"] else: bill_type = ["bill"] bill = Bill( bill_id, legislative_session=session, chamber=chamber, title=title, classification=bill_type, ) bill.add_source(url) bill.subject = self.subject_map[bill_id] for link in page.xpath("//a[contains(@id, 'Auth')]"): name = link.xpath("string()").strip() if ":" in name: raise Exception(name) if "otherAuth" in link.attrib["id"]: bill.add_sponsorship( name, classification="cosponsor", entity_type="person", primary=False, ) else: bill.add_sponsorship( name, classification="primary", entity_type="person", primary=True ) act_table = page.xpath("//table[contains(@id, 'Actions')]")[0] for tr in act_table.xpath("tr")[2:]: action = tr.xpath("string(td[1])").strip() if not action or action == "None": continue date = tr.xpath("string(td[3])").strip() date = datetime.datetime.strptime(date, "%m/%d/%Y").date() actor = tr.xpath("string(td[4])").strip() if actor == "H": actor = "lower" elif actor == "S": actor = "upper" attrs = self.categorizer.categorize(action) related_entities = [] for item in attrs["committees"]: related_entities.append({"type": "committee", "name": item}) for item in attrs["legislators"]: related_entities.append({"type": "legislator", "name": item}) bill.add_action( description=action, date=date.strftime("%Y-%m-%d"), chamber=actor, classification=attrs["classification"], related_entities=related_entities, ) version_table = page.xpath("//table[contains(@id, 'Versions')]")[0] # Keep track of already seen versions to prevent processing duplicates. version_urls = [] for link in version_table.xpath(".//a[contains(@href, '.PDF')]"): version_url = link.attrib["href"] if version_url in version_urls: self.warning("Skipping duplicate version URL.") continue else: version_urls.append(version_url) name = link.text.strip() if re.search("COMMITTEE REPORTS|SCHEDULED CCR", version_url, re.IGNORECASE): bill.add_document_link( note=name, url=version_url, media_type="application/pdf" ) continue bill.add_version_link( note=name, url=version_url, media_type="application/pdf" ) self.scrape_amendments(bill, page) for link in page.xpath(".//a[contains(@href, '_VOTES')]"): if "HT_" not in link.attrib["href"]: yield from self.scrape_votes(bill, self.urlescape(link.attrib["href"])) # # If the bill has no actions and no versions, it's a bogus bill on # # their website, which appears to happen occasionally. Skip. has_no_title = bill.title == "Short Title Not Found." if has_no_title: # If there's no title, this is an empty page. Skip! return else: # Otherwise, save the bills. yield bill
def scrape(self, window=28, matter_ids=None): '''By default, scrape board reports updated in the last 28 days. Optionally specify a larger or smaller window of time from which to scrape updates, or specific matters to scrape. Note that passing a value for :matter_ids supercedes the value of :window, such that the given matters will be scraped regardless of when they were updated. Optional parameters :window (numeric) - Amount of time for which to scrape updates, e.g. a window of 7 will scrape legislation updated in the last week. Pass a window of 0 to scrape all legislation. :matter_ids (str) - Comma-separated list of matter IDs to scrape ''' if matter_ids: matters = [ self.matter(matter_id) for matter_id in matter_ids.split(',') ] matters = filter( None, matters) # Skip matters that are not yet in Legistar elif float(window): # Support for partial days, i.e., window=0.15 n_days_ago = datetime.datetime.utcnow() - datetime.timedelta( float(window)) matters = self.matters(n_days_ago) else: # Scrape all matters, including those without a last-modified date matters = self.matters() n_days_ago = datetime.datetime.utcnow() - datetime.timedelta( float(window)) for matter in matters: # If this Boolean field is True, then do not scrape the Bill. # This issue explains why a restricted Bill might appear (unwelcome) in the Legistar API: # https://github.com/datamade/la-metro-councilmatic/issues/345#issuecomment-421184826 if matter['MatterRestrictViewViaWeb']: continue matter_id = matter['MatterId'] date = matter['MatterIntroDate'] title = matter['MatterTitle'] identifier = matter['MatterFile'] if not all((date, title, identifier)): continue bill_session = self.session(self.toTime(date)) bill_type = BILL_TYPES[matter['MatterTypeName']] if identifier.startswith('S'): alternate_identifiers = [identifier] identifier = identifier[1:] else: alternate_identifiers = [] bill = Bill(identifier=identifier, legislative_session=bill_session, title=title, classification=bill_type, from_organization={"name": "Board of Directors"}) legistar_web = matter['legistar_url'] legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id) bill.add_source(legistar_web, note='web') bill.add_source(legistar_api, note='api') for identifier in alternate_identifiers: bill.add_identifier(identifier) for action, vote in self.actions(matter_id): act = bill.add_action(**action) if action['description'] == 'Referred': body_name = matter['MatterBodyName'] act.add_related_entity( body_name, 'organization', entity_id=_make_pseudo_id(name=body_name)) result, votes = vote if result: vote_event = VoteEvent( legislative_session=bill.legislative_session, motion_text=action['description'], organization=action['organization'], classification=None, start_date=action['date'], result=result, bill=bill) vote_event.add_source(legistar_web) vote_event.add_source(legistar_api + '/histories') for vote in votes: raw_option = vote['VoteValueName'].lower() clean_option = self.VOTE_OPTIONS.get( raw_option, raw_option) vote_event.vote(clean_option, vote['VotePersonName'].strip()) yield vote_event for sponsorship in self.sponsorships(matter_id): bill.add_sponsorship(**sponsorship) for topic in self.topics(matter_id): bill.add_subject(topic['MatterIndexName'].strip()) for relation in self.relations(matter_id): try: # Get data (i.e., json) for the related bill. # Then, we can find the 'MatterFile' (i.e., identifier) and the 'MatterIntroDate' (i.e., to determine its legislative session). 
# Sometimes, the related bill does not yet exist: in this case, throw an error, and continue. related_bill = self.endpoint( '/matters/{0}', relation['MatterRelationMatterId']) except scrapelib.HTTPError: continue else: date = related_bill['MatterIntroDate'] related_bill_session = self.session(self.toTime(date)) identifier = related_bill['MatterFile'] bill.add_related_bill( identifier=identifier, legislative_session=related_bill_session, relation_type='companion') # Currently, the relation type for bills can be one of a few possibilites: https://github.com/opencivicdata/python-opencivicdata/blob/master/opencivicdata/common.py#L104 # Metro simply understands these as related files, suggesting that they receive a relation of 'companion'. bill.add_version_link( 'Board Report', 'https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID={}&GUID=LATEST&Title=Board+Report' .format(matter_id), media_type="application/pdf") for attachment in self.attachments(matter_id): if attachment['MatterAttachmentName']: bill.add_document_link( attachment['MatterAttachmentName'], attachment['MatterAttachmentHyperlink'], media_type="application/pdf") bill.extras = {'local_classification': matter['MatterTypeName']} text = self.text(matter_id) if text: if text['MatterTextPlain']: bill.extras['plain_text'] = text['MatterTextPlain'] if text['MatterTextRtf']: bill.extras['rtf_text'] = text['MatterTextRtf'].replace( u'\u0000', '') yield bill
def scrape_bill(self, chamber, session, bill_id, short_title=None): """ Scrapes documents, actions, vote counts and votes for bills from the 2009 session and above. """ url = BILL_URL % (session, bill_id.replace(' ', '')) bill_page = self.get(url, verify=False).text html = lxml.html.fromstring(bill_page) html.make_links_absolute('http://legislature.idaho.gov/legislation/%s/' % session) bill_tables = html.xpath('//table[contains(@class, "bill-table")]') title = bill_tables[1].text_content().strip() bill_type = get_bill_type(bill_id) bill = Bill(legislative_session=session, chamber=chamber, identifier=bill_id, title=title, classification=bill_type) bill.add_source(url) for subject in self._subjects[bill_id.replace(' ', '')]: bill.add_subject(subject) if short_title and title.lower() != short_title.lower(): bill.add_title(short_title, 'short title') # documents doc_links = html.xpath('//div[contains(@class,"pf-content")]//a') for link in doc_links: name = link.text_content().strip() href = link.get('href') if 'Engrossment' in name or 'Bill Text' in name: bill.add_version_link(note=name, url=href, media_type="application/pdf") else: bill.add_document_link(note=name, url=href, media_type="application/pdf") def _split(string): return re.split(r"\w+[,|AND]\s+", string) # sponsors range from a committee to one legislator to a group of legs sponsor_lists = bill_tables[0].text_content().split('by') if len(sponsor_lists) > 1: for sponsors in sponsor_lists[1:]: if 'COMMITTEE' in sponsors.upper(): bill.add_sponsorship(name=sponsors.strip(), entity_type="organization", primary=True, classification='primary') else: for person in _split(sponsors): person = person.strip() if person != "": bill.add_sponsorship(classification='primary', name=person, entity_type="person", primary=True) actor = chamber last_date = None for row in bill_tables[2]: # lots of empty rows if len(row) == 1: continue _, date, action, _ = [x.text_content().strip() for x in row] if date: last_date = date else: date = last_date date = datetime.datetime.strptime(date + '/' + session[0:4], "%m/%d/%Y").strftime('%Y-%m-%d') if action.startswith('House'): actor = 'lower' elif action.startswith('Senate'): actor = 'upper' # votes if 'AYES' in action or 'NAYS' in action: yield from self.parse_vote(actor, date, row[2], session, bill_id, chamber, url) # bill.add_vote_event(vote) # some td's text is seperated by br elements if len(row[2]): action = "".join(row[2].itertext()) action = action.replace(u'\xa0', ' ').strip() atype = get_action(actor, action) bill.add_action(action, date, chamber=actor, classification=atype) # after voice vote/roll call and some actions the bill is sent # 'to House' or 'to Senate' if 'to House' in action: actor = 'lower' elif 'to Senate' in action: actor = 'upper' yield bill
def get_bill(self, matter): '''Make Bill object from given matter.''' ''' Currently, NYC Legistar does not have conventional "Types" for three newly added committees: https://legistar.council.nyc.gov/Departments.aspx We communicated the issue to NYC, and until we learn more, we will skip the bills attached to those committees. ''' orgs_without_type = ['Charter Revision Commission 2019', 'New York City Advisory Commission on Property Tax Reform', 'Democratic Conference of the Council of the City of New York'] if matter['MatterBodyName'].strip() in orgs_without_type: return None matter_id = matter['MatterId'] if matter_id in DUPLICATED_ACTIONS: return None date = matter['MatterIntroDate'] title = matter['MatterName'] identifier = matter['MatterFile'] if not all((date, title, identifier)): return None leg_type = BILL_TYPES[matter['MatterTypeName']] bill_session = self.sessions(self.toTime(date)) bill = Bill(identifier=identifier, title=title, classification=leg_type, legislative_session=bill_session, from_organization={"name": "New York City Council"}) legistar_web = matter['legistar_url'] legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id) bill.add_source(legistar_web, note='web') bill.add_source(legistar_api, note='api') if matter['MatterTitle']: bill.add_title(matter['MatterTitle']) if matter['MatterEXText5']: bill.add_abstract(matter['MatterEXText5'], note='') try: for sponsorship in self.sponsorships(matter_id): bill.add_sponsorship(**sponsorship) except KeyError: self.version_errors.append(legistar_web) return None for attachment in self.attachments(matter_id): if attachment['MatterAttachmentId'] == 103315: # Duplicate return None if attachment['MatterAttachmentName']: bill.add_document_link(attachment['MatterAttachmentName'], attachment['MatterAttachmentHyperlink'], media_type='application/pdf') for topic in self.topics(matter_id) : bill.add_subject(topic['MatterIndexName'].strip()) for relation in self.relations(matter_id): try: related_bill = self.endpoint('/matters/{0}', relation['MatterRelationMatterId']) except scrapelib.HTTPError: return None else: date = related_bill['MatterIntroDate'] related_bill_session = self.session(self.toTime(date)) identifier = related_bill['MatterFile'] bill.add_related_bill(identifier=identifier, legislative_session=related_bill_session, relation_type='companion') try: text = self.text(matter_id) except KeyError: self.version_errors.append(legistar_web) return None bill.extras['local_classification'] = matter['MatterTypeName'] if text: if text['MatterTextPlain']: bill.extras['plain_text'] = text['MatterTextPlain'].replace(u'\u0000', '') if text['MatterTextRtf']: bill.extras['rtf_text'] = text['MatterTextRtf'].replace(u'\u0000', '') return bill
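# The replace(u'\u0000', '') calls above guard against NUL characters in
# Legistar text payloads, which text stores such as Postgres reject. As a
# sketch; scrub_nul is a hypothetical helper, not part of the scraper.
def scrub_nul(text):
    # None and empty strings pass through untouched
    return text.replace("\u0000", "") if text else text


assert scrub_nul("law\u0000 text") == "law text"
assert scrub_nul(None) is None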
def scrape(self): for leg_summary in self.legislation( created_after=datetime.datetime(2014, 1, 1)): leg_type = BILL_TYPES[leg_summary['Type']] bill = Bill(identifier=leg_summary['File\xa0#'], title=leg_summary['Title'], legislative_session=None, classification=leg_type, from_organization={"name": "New York City Council"}) bill.add_source(leg_summary['url']) leg_details = self.legDetails(leg_summary['url']) history = self.history(leg_summary['url']) bill.add_title(leg_details['Name'], note='created by administrative staff') if 'Summary' in leg_details: bill.add_abstract(leg_details['Summary'], note='') if leg_details['Law number']: bill.add_identifier(leg_details['Law number'], note='law number') for sponsorship in self._sponsors(leg_details.get('Sponsors', [])): sponsor, sponsorship_type, primary = sponsorship bill.add_sponsorship(sponsor, sponsorship_type, 'person', primary, entity_id=_make_pseudo_id(name=sponsor)) for attachment in leg_details.get('Attachments', []): bill.add_document_link(attachment['label'], attachment['url'], media_type="application/pdf") history = list(history) if history: earliest_action = min( self.toTime(action['Date']) for action in history) bill.legislative_session = self.sessions(earliest_action) else: bill.legislative_session = str(self.SESSION_STARTS[0]) for action in history: action_description = action['Action'] if not action_description: continue action_class = ACTION_CLASSIFICATION[action_description] action_date = self.toDate(action['Date']) responsible_org = action['Action\xa0By'] if responsible_org == 'City Council': responsible_org = 'New York City Council' elif responsible_org == 'Administration': responsible_org = 'Mayor' if responsible_org == 'Town Hall Meeting': continue else: act = bill.add_action( action_description, action_date, organization={'name': responsible_org}, classification=action_class) if 'url' in action['Action\xa0Details']: action_detail_url = action['Action\xa0Details']['url'] if action_class == 'committee-referral': action_details = self.actionDetails(action_detail_url) referred_committee = action_details[ 'Action text'].rsplit(' to the ', 1)[-1] act.add_related_entity( referred_committee, 'organization', entity_id=_make_pseudo_id(name=referred_committee)) result, votes = self.extractVotes(action_detail_url) if votes: action_vote = VoteEvent( legislative_session=bill.legislative_session, motion_text=action_description, organization={'name': responsible_org}, classification=action_class, start_date=action_date, result=result, bill=bill) action_vote.add_source(action_detail_url) for option, voter in votes: action_vote.vote(option, voter) yield action_vote text = self.text(leg_summary['url']) if text: bill.extras = { 'local_classification': leg_summary['Type'], 'full_text': text } else: bill.extras = {'local_classification': leg_summary['Type']} yield bill
def scrape_bill(self, chamber, session, bill_id): # try and get bill for the first year of the session biennium url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % ( session[:4], bill_id.replace(' ', '-')) html = self.get(url).text # Otherwise, try second year of the session biennium if ('Page Not Found' in html or 'The bill you are looking for is not available yet' in html): url = 'http://legislature.mi.gov/doc.aspx?%s-%s' % ( session[-4:], bill_id.replace(' ', '-')) html = self.get(url).text if ('Page Not Found' in html or 'The bill you are looking for is not available yet' in html): self.warning("Cannot open bill page for {}; skipping".format(bill_id)) return doc = lxml.html.fromstring(html) doc.make_links_absolute('http://legislature.mi.gov') title = doc.xpath('//span[@id="frg_billstatus_ObjectSubject"]')[0].text_content() # get B/R/JR/CR part and look up bill type bill_type = bill_types[bill_id.split(' ')[0][1:]] bill = Bill(bill_id, session, title, chamber=chamber, classification=bill_type) bill.add_source(url) # sponsors sponsors = doc.xpath('//span[@id="frg_billstatus_SponsorList"]/a') for sponsor in sponsors: name = sponsor.text.replace(u'\xa0', ' ') # sometimes district gets added as a link if name.isnumeric(): continue if len(sponsors) > 1: classification = ( 'primary' if sponsor.tail and 'primary' in sponsor.tail else 'cosponsor' ) else: classification = 'primary' bill.add_sponsorship( name=name, chamber=chamber, entity_type='person', primary=classification == 'primary', classification=classification, ) bill.subject = doc.xpath('//span[@id="frg_billstatus_CategoryList"]/a/text()') # actions (skip header) for row in doc.xpath('//table[@id="frg_billstatus_HistoriesGridView"]/tr')[1:]: tds = row.xpath('td') # date, journal link, action date = tds[0].text_content() journal = tds[1].text_content() action = tds[2].text_content() date = TIMEZONE.localize(datetime.datetime.strptime(date, "%m/%d/%Y")) # instead of trusting upper/lower case, use journal for actor actor = 'upper' if 'SJ' in journal else 'lower' classification = categorize_action(action) bill.add_action(action, date, chamber=actor, classification=classification) # check if action mentions a sub submatch = re.search(r'WITH SUBSTITUTE\s+([\w\-\d]+)', action, re.IGNORECASE) if submatch and tds[2].xpath('a'): version_url = tds[2].xpath('a/@href')[0] version_name = tds[2].xpath('a/text()')[0].strip() version_name = 'Substitute {}'.format(version_name) self.info("Found Substitute {}".format(version_url)) if version_url.lower().endswith('.pdf'): mimetype = 'application/pdf' elif version_url.lower().endswith('.htm'): mimetype = 'text/html' bill.add_version_link(version_name, version_url, media_type=mimetype) # check if action mentions a vote rcmatch = re.search(r'Roll Call # (\d+)', action, re.IGNORECASE) if rcmatch: rc_num = rcmatch.groups()[0] # in format mileg.aspx?page=getobject&objectname=2011-SJ-02-10-011 journal_link = tds[1].xpath('a/@href') if journal_link: objectname = journal_link[0].rsplit('=', 1)[-1] chamber_name = {'upper': 'Senate', 'lower': 'House'}[actor] vote_url = BASE_URL + '/documents/%s/Journal/%s/htm/%s.htm' % ( session, chamber_name, objectname) results = self.parse_roll_call(vote_url, rc_num) if results is not None: vote_passed = len(results['yes']) > len(results['no']) vote = VoteEvent( start_date=date, chamber=actor, bill=bill, motion_text=action, result='pass' if vote_passed else 'fail', classification='passage', ) # check the expected counts vs actual count = re.search(r'YEAS (\d+)', action, 
re.IGNORECASE) count = int(count.groups()[0]) if count else 0 if count != len(results['yes']): self.warning('vote count mismatch for %s %s, %d != %d' % (bill_id, action, count, len(results['yes']))) count = re.search(r'NAYS (\d+)', action, re.IGNORECASE) count = int(count.groups()[0]) if count else 0 if count != len(results['no']): self.warning('vote count mismatch for %s %s, %d != %d' % (bill_id, action, count, len(results['no']))) vote.set_count('yes', len(results['yes'])) vote.set_count('no', len(results['no'])) vote.set_count('other', len(results['other'])) for name in results['yes']: vote.yes(name) for name in results['no']: vote.no(name) for name in results['other']: vote.vote('other', name) vote.add_source(vote_url) yield vote else: self.warning("missing journal link for %s %s" % (bill_id, journal)) # versions for row in doc.xpath('//table[@id="frg_billstatus_DocumentGridTable"]/tr'): parsed = self.parse_doc_row(row) if parsed: name, url = parsed if url.endswith('.pdf'): mimetype = 'application/pdf' elif url.endswith('.htm'): mimetype = 'text/html' bill.add_version_link(name, url, media_type=mimetype) # documents for row in doc.xpath('//table[@id="frg_billstatus_HlaTable"]/tr'): document = self.parse_doc_row(row) if document: name, url = document bill.add_document_link(name, url) for row in doc.xpath('//table[@id="frg_billstatus_SfaTable"]/tr'): document = self.parse_doc_row(row) if document: name, url = document bill.add_document_link(name, url) yield bill
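# The YEAS/NAYS cross-check above, reduced to a sketch: pull the printed
# count out of the action string and compare it with the parsed roll call.
# expected_count is a hypothetical helper, not part of the Michigan scraper.
import re

def expected_count(action, label):
    # label is "YEAS" or "NAYS"; returns 0 when the action omits a count
    match = re.search(r"%s (\d+)" % label, action, re.IGNORECASE)
    return int(match.group(1)) if match else 0


assert expected_count("passed; Roll Call # 8 YEAS 63 NAYS 45", "YEAS") == 63
assert expected_count("referred to committee", "NAYS") == 0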