def scrape(self):
    """Scrape Temecula City Council and its members from the list page."""
    urls = Urls(dict(list=legislators_url), self)

    council = Organization(
        'Temecula City Council', classification='legislature')
    council.add_source(urls.list.url)
    yield council

    # Skip the header row of the second table.
    for row in urls.list.xpath('//table[2]//tr')[1:]:
        # Each row carries the member's name/role text and a portrait.
        name, role = row.xpath('td/p[1]//font/text()')
        image = row.xpath('td/img/@src').pop()

        person = Person(name, image=image)
        membership = person.add_membership(council, role=role)

        # First link is a mailto:, second is the member detail page.
        email, detail_url = row.xpath('td//a/@href')
        email = email[7:]  # strip the leading "mailto:"
        membership.contact_details.append(
            dict(type='email', value=email, note='work'))

        person.add_source(urls.list.url)
        person.add_source(detail_url)
        yield person
def scrape_approp_subcommittees(self):
    """Scrape Senate Appropriations subcommittees and their members."""
    URL = 'http://www.senate.michigan.gov/committee/appropssubcommittee.html'
    doc = lxml.html.fromstring(self.get(URL).text)

    for heading in doc.xpath('//strong'):
        com = Organization(
            name=heading.text.strip(),
            parent_id=self._senate_appropriations,
            classification='committee',
        )
        com.add_source(URL)

        # Member names trail the heading in its sibling's tail text.
        roster = heading.getnext().tail.replace('Senators', '').strip()
        for member in re.split(', | and ', roster):
            # A parenthesized suffix code marks leadership; strip it
            # (plus the preceding space) from the name.
            if member.endswith('(C)'):
                role, member = 'chairman', member[:-4]
            elif member.endswith('(VC)'):
                role, member = 'vice chairman', member[:-5]
            elif member.endswith('(MVC)'):
                role, member = 'minority vice chairman', member[:-6]
            else:
                role = 'member'
            com.add_member(member, role=role)

        yield com
def scrape(self):
    """Scrape municipal councillors listed under district headings.

    Yields one Organization per district council plus one Person per
    councillor with membership contact details.
    """
    page = self.lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//div[@class="entry-content"]//p/strong')
    for councillor in councillors:
        # District is the nearest preceding heading, up to the en dash.
        # FIX: str has no .decode() on Python 3; use unicode literals
        # directly, which behave identically on Python 2 and 3.
        district = councillor.xpath(
            './ancestor::p/preceding-sibling::h2')[-1].text_content().split(u'–')[0]
        name = ' '.join(councillor.text_content().split()[-2:]).replace(u'-Â', '')
        role = councillor.text_content().replace(name, '').split('-')[0]
        if 'SAO' in role or not role:
            continue

        org = Organization(name=district + ' Municipal Council',
                           classification='legislature',
                           jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(COUNCIL_PAGE)
        yield org

        p = Person(primary_org='legislature', name=name, district=district)
        p.add_source(COUNCIL_PAGE)
        membership = p.add_membership(org, role=role, district=district)

        # Contact lines appear as bare text nodes within the paragraph.
        info = councillor.xpath('./ancestor::p/text()')
        for contact in info:
            if 'NT' in contact:
                membership.add_contact_detail('address', contact.strip(), 'legislature')
            if 'Tel' in contact:
                contact = contact.replace('Tel. ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('voice', contact, 'legislature')
            if 'Fax' in contact:
                contact = contact.replace('Fax ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('fax', contact, 'legislature')

        email = self.get_email(councillor, './parent::p')
        membership.add_contact_detail('email', email)

        if 'Website' in councillor.xpath('./parent::p')[0].text_content():
            p.add_link(councillor.xpath('./parent::p//a')[1].attrib['href'])
        yield p
def scrape_comm(self, chamber):
    """Scrape Mississippi committee membership from the chamber XML feed."""
    url = 'http://billstatus.ls.state.ms.us/htms/%s_cmtememb.xml' % chamber
    response = self.get(url)
    root = lxml.etree.fromstring(response.content)

    # The feed is addressed by single-letter chamber code.
    chamber = 'lower' if chamber == 'h' else 'upper'

    for node in root.xpath('//COMMITTEE'):
        comm = Organization(node.xpath('string(NAME)'),
                            chamber=chamber,
                            classification='committee')

        # Chair and vice-chair carry their titles inline; strip them.
        chair = node.xpath('string(CHAIR)').replace(", Chairman", "")
        if chair:
            comm.add_member(chair, role="Chairman")

        vice_chair = node.xpath('string(VICE_CHAIR)').replace(", Vice-Chairman", "")
        if vice_chair:
            comm.add_member(vice_chair, role="Vice-Chairman")

        # Remaining members are semicolon-separated; drop one trailing
        # empty entry if present.
        members = node.xpath('string(MEMBERS)').split(";")
        if "" in members:
            members.remove("")
        for member in members:
            comm.add_member(member.strip())

        comm.add_source(url)
        yield comm
def scrape_chamber(self, chamber):
    """Scrape Arizona committees for one chamber of the latest session."""
    session = self.latest_session()
    # Only the latest session is scraped, so resolve its internal id.
    session_id = session_metadata.session_id_meta_data[session]

    body = 'S' if chamber == 'upper' else 'H'
    client = AZClient()
    committees = client.list_committees(
        sessionId=session_id,
        includeOnlyCommitteesWithAgendas='false',
        legislativeBody=body,
    )
    for committee in committees.json():
        org = Organization(name=committee['CommitteeName'],
                           chamber=chamber,
                           classification='committee')

        # Pull the full membership for this standing committee.
        details = client.get_standing_committee(
            sessionId=session_id,
            legislativeBody=body,
            committeeId=committee['CommitteeId'],
            includeMembers='true',
        )
        for member in details.json()[0]['Members']:
            org.add_member(
                u'{} {}'.format(member['FirstName'], member['LastName']),
                role=parse_role(member),
            )
        org.add_source(details.url)

        org.add_source(committees.url)
        yield org
def scrape_committee(self, chamber, name, url):
    """Scrape one committee page, handling joint bodies and subcommittees."""
    doc = lxml.html.fromstring(self.get(url).text)

    # Joint committees are flagged by an explicit heading on the page.
    if doc.xpath("//h3[. = 'Joint Committee']"):
        chamber = 'joint'

    subcommittee = doc.xpath("//h3[@align='center']/text()")[0]
    if "Subcommittee" in subcommittee:
        # Attach subcommittees beneath their named parent committee.
        comm = Organization(
            name=subcommittee,
            classification='committee',
            parent_id={'classification': chamber, 'name': name})
    else:
        comm = Organization(
            chamber=chamber, name=name, classification='committee')
    comm.add_source(url)

    for link in doc.xpath("//a[contains(@href, 'member=')]"):
        member = link.text.strip()
        # The role label sits in the preceding table cell.
        mtype = link.xpath("string(../preceding-sibling::td[1])")
        comm.add_member(member, mtype.strip(": \r\n\t").lower())

    if not comm._related:
        self.warning('not saving %s, appears to be empty' % name)
    else:
        yield comm
def scrape_committees(self, session):
    """Scrape Oregon committees and their members from the OLIS API.

    Yields one Organization per committee; members whose code cannot be
    resolved are recorded under their raw legislator code.
    """
    session_key = SESSION_KEYS[session]
    committees_response = self.api_client.get('committees', session=session_key)

    legislators = index_legislators(self, session_key)

    chamber_map = {'S': 'upper', 'H': 'lower', 'J': 'legislature'}
    for committee in committees_response:
        org = Organization(
            chamber=chamber_map[committee['HouseOfAction']],
            name=committee['CommitteeName'],
            classification='committee')
        org.add_source(
            'https://olis.leg.state.or.us/liz/{session}'
            '/Committees/{committee}/Overview'.format(
                session=session_key, committee=committee['CommitteeName']))

        members_response = self.api_client.get(
            'committee_members',
            session=session_key,
            committee=committee['CommitteeCode'])
        for member in members_response:
            try:
                member_name = legislators[member['LegislatorCode']]
            except KeyError:
                # FIX: Logger.warn() is a deprecated alias of warning().
                # Fall back to the raw code so membership is still kept.
                logger.warning('Legislator {} not found in session {}'.format(
                    member['LegislatorCode'], session_key))
                member_name = member['LegislatorCode']
            org.add_member(member_name,
                           role=member['Title'] if member['Title'] else '')

        yield org
def _scrape_upper_committee(self, name, url2):
    """Scrape an upper-chamber committee's roster from its Assignments page.

    The first two listed members are the chair and vice-chair.
    """
    # The roster lives on Assignments.asp, not the default page.
    url3 = url2.replace("default.asp", "Assignments.asp")

    committee = Organization(name, chamber="upper", classification="committee")
    committee.add_source(url2)

    page = self.lxmlize(url3)
    members = page.xpath('//table[@id="table38"]//font/a/b')
    # FIX: use enumerate instead of comparing against members[0]/members[1],
    # which raised IndexError on rosters with fewer than two entries.
    for position, link in enumerate(members):
        if position == 0:
            role = "Chairman"
        elif position == 1:
            role = "Vice-Chairman"
        else:
            role = "member"
        member_name = link.xpath('string()').replace('Senator ', '')
        # FIX: raw string — '\s' in a plain literal is an invalid escape
        # (DeprecationWarning on Python 3.6+).
        member_name = re.sub(r'[\s]{2,}', ' ', member_name).strip()
        committee.add_member(member_name, role)

    yield committee
def scrape_chamber(self, chamber, session):
    """Scrape WA committees for one chamber from the legislative service."""
    url = "%s/GetActiveCommittees?biennium=%s" % (self._base_url, session)
    root = lxml.etree.fromstring(self.get(url).content)

    for node in xpath(root, "//wa:Committee"):
        agency = xpath(node, "string(wa:Agency)")
        # The feed mixes both chambers; keep only the requested one.
        if {'House': 'lower', 'Senate': 'upper'}[agency] != chamber:
            continue

        name = xpath(node, "string(wa:Name)")
        phone = xpath(node, "string(wa:Phone)")

        comm = Organization(name, chamber=chamber, classification='committee')
        comm.extras['phone'] = phone
        self.scrape_members(comm, agency)
        comm.add_source(url)

        if not comm._related:
            self.warning('empty committee: %s', name)
        else:
            yield comm
def scrape_lower_committee(self, name, url):
    """Scrape a lower-chamber committee's member list.

    Returns the populated Organization; the first listed member is
    treated as the chair.
    """
    page = self.lxmlize(url)

    committee = Organization(chamber='lower', name=name,
                             classification="committee")
    committee.add_source(url)

    seen = set()
    member_links = self.get_nodes(
        page,
        '//div[@class="mod-inner"]//a[contains(@href, "mem")]')
    for member_link in member_links:
        member_name = member_link.text
        if member_name is None:
            continue

        # Figure out if this person is the chair.
        if member_link == member_links[0]:
            member_role = 'chair'
        else:
            member_role = 'member'

        # BUG FIX: the duplicate check previously tested the committee
        # `name` against `seen` (never added to it), so duplicate members
        # slipped through. Test the member's own name instead.
        if member_name not in seen:
            committee.add_member(member_name, member_role)
            seen.add(member_name)

    return committee
def _scrape_lower_special_committees(self):
    """Scrape House special committees from the accordion listing."""
    url = 'http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx'
    page = self.lxmlize(url)

    committee_list = page.xpath('//div[@class="accordion"]')[0]
    for header in committee_list.xpath('./h3'):
        committee_name = self._normalize_committee_name(
            header.xpath('string()').strip())

        # Joint committees belong to the whole legislature.
        chamber = 'legislature' if committee_name.startswith('Joint') else 'lower'

        committee = Organization(committee_name, chamber=chamber,
                                 classification='committee')
        committee.add_source(url)

        # Member rows sit in the pane following each accordion header.
        rows = header.xpath('./following-sibling::div[@class="pane"]'
                            '//tr[@class="linkStyle2"]')
        for row in rows:
            member_name = self._normalize_member_name(
                row.xpath('normalize-space(string(./th[1]))'))
            member_role = self._normalize_member_role(
                row.xpath('normalize-space(string(./th[2]))'))
            committee.add_member(member_name, member_role)

        yield committee
def add_committees(self, legislator_page, legislator, chamber, url):
    """Record committee memberships from a legislator's profile page."""
    # As of today, both chambers do committees the same way! Yay!
    rows = self.get_nodes(
        legislator_page,
        '//div[@id="ContentPlaceHolder1_TabSenator_TabCommittees"]//table/'
        'tr')
    if not rows:
        return

    # First row is the table header; skip it.
    for row in rows[1:]:
        committee_name = self.get_node(row, './td[2]').text_content().strip()
        if not committee_name:
            continue

        role = self.get_node(row, './td[3]').text_content().strip()

        # Create each committee only once, caching it by name.
        if committee_name not in self.committees:
            org = Organization(
                name=committee_name,
                chamber=chamber,
                classification='committee')
            org.add_source(url)
            self.committees[committee_name] = org

        self.committees[committee_name].add_member(
            legislator.name,
            role=role,
        )
def test_full_organization():
    """Round-trip an organization through the importer and check all fields."""
    org = ScrapeOrganization('United Nations', classification='international')
    org.add_identifier('un')
    org.add_name('UN', start_date='1945')
    org.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    org.add_link('http://example.com/link')
    org.add_source('http://example.com/source')

    # Import the scraped organization.
    OrganizationImporter('jurisdiction-id').import_data([org.as_dict()])

    # Fetch it back from the database and verify every imported field.
    imported = Organization.objects.get()
    assert 'ocd-organization' in imported.id
    assert imported.name == org.name

    identifier = imported.identifiers.all()[0]
    assert identifier.identifier == 'un'
    assert identifier.scheme == ''

    other_name = imported.other_names.all()[0]
    assert other_name.name == 'UN'
    assert other_name.start_date == '1945'

    contact = imported.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert imported.links.all()[0].url == 'http://example.com/link'
    assert imported.sources.all()[0].url == 'http://example.com/source'
def scrape(self, chamber=None):
    """Scrape Nevada committees, optionally restricted to one chamber."""
    chambers = [chamber] if chamber else ['upper', 'lower']
    chamber_names = {'lower': 'Assembly', 'upper': 'Senate'}

    for chamber in chambers:
        insert = self.jurisdiction.session_slugs[self.latest_session()]
        list_url = '%s/%s/HomeCommittee/LoadCommitteeListTab' % (nelis_root, insert)
        doc = lxml.html.fromstring(self.get(list_url).text)

        # Committee links live under the chamber-specific panel.
        sel = 'panel%sCommittees' % chamber_names[chamber]
        ul = doc.xpath('//ul[@id="%s"]' % sel)[0]
        for com in ul.xpath('li/div/div/div[@class="col-md-4"]/a'):
            name = com.text.strip()
            com_id = (re.match(r'.*/Committee/(?P<id>[0-9]+)/Overview',
                               com.attrib['href'])
                      .group('id'))
            com_url = '%s/%s/Committee/FillSelectedCommitteeTab?committeeOrSubCommitteeKey=%s'\
                      '&selectedTab=Overview' % (nelis_root, insert, com_id)

            org = Organization(name=name, chamber=chamber,
                               classification="committee")
            org.add_source(com_url)
            self.scrape_comm_members(chamber, org, com_url)
            yield org
def scrape_approp_subcommittees(self, url):
    """Scrape Appropriations subcommittees from the given page."""
    doc = lxml.html.fromstring(self.get(url).text)

    for heading in doc.xpath('//strong'):
        com = Organization(
            name=heading.text.strip(),
            parent_id={
                'name': 'Appropriations',
                'classification': 'committee',
            },
            classification='committee',
        )
        com.add_source(url)

        # Member names follow the heading in its sibling's tail text.
        roster = heading.getnext().tail.replace('Senators', '').strip()
        for member in re.split(', | and ', roster):
            # Leadership is flagged by a parenthesized suffix code;
            # strip it (and the preceding space) from the name.
            if member.endswith('(C)'):
                role, member = 'chairman', member[:-4]
            elif member.endswith('(VC)'):
                role, member = 'vice chairman', member[:-5]
            elif member.endswith('(MVC)'):
                role, member = 'minority vice chairman', member[:-6]
            else:
                role = 'member'
            com.add_member(member, role=role)

        yield com
def scrape_senate_committee(self, url):
    """Scrape a Senate committee page for its name and member roster."""
    doc = lxml.html.fromstring(self.get(url).text)

    headers = doc.xpath('(//div[@class="row"])[2]//h1')
    assert len(headers) == 1
    name = ' '.join(headers[0].xpath('./text()'))
    # Drop the trailing "... Committee" suffix from the heading.
    name = re.sub(r'\s+Committee.*$', '', name)

    com = Organization(chamber='upper', name=name, classification='committee')

    for item in doc.xpath('(//div[@class="row"])[3]/div[1]/ul[1]/li'):
        text = item.text_content()
        member = item.xpath('./a/text()')[0].replace('Representative ', '')

        # Leadership roles are described in the surrounding list text.
        if 'Committee Chair' in text:
            role = 'chair'
        elif 'Minority Vice' in text:
            role = 'minority vice chair'
        elif 'Vice' in text:
            role = 'majority vice chair'
        else:
            role = 'member'

        com.add_member(member, role=role)

    com.add_source(url)
    yield com
def scrape_comm(self, url, chamber):
    """Scrape committees for one chamber from the JSON endpoint."""
    data = self.post(url).json()['Data']

    for item in data:
        comm_name = item['CommitteeName']
        committee = Organization(name=comm_name, chamber=chamber,
                                 classification='committee')

        # Missing chairs come back as None; str() normalizes them to
        # the sentinel string 'None' for the comparisons below.
        chair_man = str(item['ChairName'])
        vice_chair = str(item['ViceChairName'])

        comm_url = self.get_comm_url(chamber, item['CommitteeId'], comm_name)
        members = self.scrape_member_info(comm_url)

        if vice_chair != 'None':
            committee.add_member(vice_chair, role='Vice-Chair')
        if chair_man != 'None':
            committee.add_member(chair_man, role='Chairman')

        for member in members:
            # vice_chair and chair_man already added.
            if chair_man in member or vice_chair in member:
                continue
            member = " ".join(member.split())
            if member:
                committee.add_member(member)

        committee.add_source(comm_url)
        committee.add_source(url)
        yield committee
def scrape_reps_comm(self):
    """Scrape Maine House committees and their members."""
    # As of 1/27/15, the committee page has the wrong session number
    # (126th) at the top, but has newly elected people, so we're
    # rolling with it.
    url = 'http://legislature.maine.gov/house/hsecoms.htm'
    root = lxml.html.fromstring(self.get(url).text)

    count = 0
    # Committee headings occupy every other <center> block.
    for n in range(1, 12, 2):
        comm_name = root.xpath('string(//body/center[%s]/h1/a)' % (n))
        committee = Organization(chamber='lower', name=comm_name,
                                 classification='committee')

        count = count + 1
        for el in root.xpath('/html/body/ul[%s]/li/a' % (count)):
            rep = el.text
            # Trim the title prefix and anything from the parenthesis on.
            if rep.find('(') != -1:
                mark = rep.find('(')
                rep = rep[15: mark].strip()
            if 'chair' in rep.lower():
                role = 'chair'
                rep = re.sub(r'(?i)[\s,]*chair\s*$', '', rep).strip()
            else:
                role = 'member'
            committee.add_member(rep, role)

        committee.add_source(url)
        yield committee
def handle_page(self):
    """Build a committee (or Appropriations subcommittee) from the page."""
    name = self.doc.xpath('//h2[@class="committeeName"]')[0].text

    if name.startswith('Appropriations Subcommittee'):
        # Attach subcommittees under the Appropriations parent.
        name = name.replace('Appropriations ', '')
        parent = {'name': 'Appropriations', 'classification': 'upper'}
        chamber = None
    else:
        if name.startswith('Committee on'):
            name = name.replace('Committee on ', '')
        parent = None
        chamber = 'upper'

    comm = Organization(name=name,
                        classification="committee",
                        chamber=chamber,
                        parent_id=parent,
                        )

    # Leadership entries appear as <dt>role</dt><dd>name</dd> pairs.
    for dt in self.doc.xpath('//div[@id="members"]/dl/dt'):
        role = dt.text.replace(': ', '').strip().lower()
        member = self.clean_name(
            dt.xpath('./following-sibling::dd')[0].text_content())
        comm.add_member(member, role=role)

    # Plain members are listed separately, without roles.
    for li in self.doc.xpath('//div[@id="members"]/ul/li'):
        comm.add_member(self.clean_name(li.text_content()))

    comm.add_source(self.url)
    yield comm
def scrape_committees(self, repos):
    """Scrape congressional committees (and subcommittees) from YAML repos.

    Yields a committee Organization per YAML entry, with subcommittees
    emitted before their parent.
    """
    for repo in repos:
        source = "https://raw.githubusercontent.com/unitedstates/congress-legislators/master/{0}".format(repo)
        committees = self.fetch_yaml(source)
        for committee in committees:
            org = Organization(committee["name"], classification="committee")
            org.add_source(source)

            for key in committee.keys() & {"url", "rss_url"}:
                org.add_link(committee[key])
            for key in committee.keys() & {"phone", "address"}:
                # "phone" maps to the generic "voice" contact type.
                if key == "phone":
                    org.add_contact_detail(type="voice", value=committee[key])
                else:
                    org.add_contact_detail(type=key, value=committee[key])
            for key in committee.keys() & {"senate_committee_id",
                                           "house_committee_id", "thomas_id"}:
                org.add_identifier(committee[key], scheme=key)

            if "subcommittees" in committee:
                for subcommittee in committee["subcommittees"]:
                    sub_org = Organization(subcommittee["name"],
                                           classification="committee",
                                           parent_id=org._id)
                    sub_org.add_identifier(subcommittee["thomas_id"],
                                           scheme="thomas")
                    sub_org.add_source(source)
                    for key in subcommittee.keys() & {"phone", "address"}:
                        # BUG FIX: previously read committee[key] here, so
                        # subcommittees received their parent's phone or
                        # address (or raised KeyError when the parent
                        # lacked the key).
                        if key == "phone":
                            sub_org.add_contact_detail(
                                type="voice", value=subcommittee[key])
                        else:
                            sub_org.add_contact_detail(
                                type=key, value=subcommittee[key])
                    yield sub_org

            yield org
def scrape(self):
    """Yield one canonical Organization per committee code across sessions.

    NOTE(review): indentation reconstructed from collapsed source — the
    per-instance statements below are assumed to run for every term
    instance, not only the canonical one; confirm against history.
    """
    sessions = reversed(self.jurisdiction.legislative_sessions)
    committee_term_instances = committees_from_sessions(self, sessions)
    committees_by_code = build_lookup_dict(self, data_list=committee_term_instances, index_key='code')

    for code, instances in committees_by_code.items():
        # TODO: Figure out how to edit city council org.
        if code == 'CC':
            continue

        extras = {'tmmis_decision_body_ids': []}
        for i, inst in enumerate(instances):
            # TODO: Ensure this survives addition of new term (2017)
            # so specific year always creates
            canonical_i = 0
            if i == canonical_i:
                o = Organization(name=inst['name'], classification='committee')
                extras.update({'description': inst['info']})
                o.add_identifier(inst['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

            # `extras` is shared (aliased) with o.extras, so appends made
            # by later instances remain visible on the organization.
            extras['tmmis_decision_body_ids'].append({inst['term']: inst['decision_body_id']})
            o.extras = extras
            o.add_source(inst['source_url'])

            if instances[canonical_i]['name'] != inst['name']:
                # TODO: Add start_date and end_date
                o.add_name(inst['name'])

        yield o
def scrape_interim_committee(self, link, name):
    """Scrape a WV interim (joint) committee and return it populated."""
    url = re.sub(r'\s+', '', link.attrib['href'])
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    if 'Subcommittee' in name:
        # Check whether the parent committee is manually defined first
        # before attempting to automatically resolve it.
        parent = WVCommitteeScraper.subcommittee_parent_map.get(name, None)
        if parent is None:
            parent = name.partition('Subcommittee')[0].strip()

        comm = Organization(
            name=name,
            classification='committee',
            parent_id={'name': parent, 'classification': 'joint'}
        )
    else:
        comm = Organization(name=name, classification='committee',
                            chamber='joint')
    comm.add_source(url)

    for member_link in doc.xpath('//a[contains(@href, "?member=")]'):
        # Strip chamber titles from the member's name.
        member = member_link.text_content().strip()
        member = re.sub(r'^Delegate\s+', '', member)
        member = re.sub(r'^Senator\s+', '', member)
        role = member_link.getnext().text or 'member'
        comm.add_member(member, role.strip())

    return comm
def scrape_committee(self, term, href, name):
    """Scrape a committee roster page, inferring its chamber from the URL."""
    page = lxml.html.fromstring(self.get(href).text)
    page.make_links_absolute(href)

    members = page.xpath("//div[@class='view-content']"
                         "//a[contains(@href, 'members')]")

    if '/joint/' in href:
        chamber = 'legislature'
    elif '/senate/' in href:
        chamber = 'upper'
    elif '/house/' in href:
        chamber = 'lower'
    else:
        # Interim committees and others were causing duplicate committee
        # issues; skip anything we cannot place.
        self.warning('Failed to identify chamber for {}; skipping'.format(href))
        return

    cttie = Organization(name, chamber=chamber, classification='committee')

    role_map = {"Legislative Members": "member",
                "Chairman": "chair",
                "Vice Chairman": "member"}
    for anchor in members:
        member = anchor.text
        # The pane title above each roster block names the role.
        role = role_map[anchor.xpath(
            "ancestor::div/h2[@class='pane-title']/text()")[0].strip()]
        if member is None or member.startswith("District"):
            continue
        member = member.replace('Senator ', '').replace('Representative ', '')
        cttie.add_member(member, role=role)

    cttie.add_source(href)
    yield cttie
def scrape_committee(self, name, url, chamber):
    """Scrape a committee page and return the populated Organization."""
    org = Organization(name=name, chamber=chamber,
                       classification='committee')
    org.add_source(url)

    doc = lxml.html.fromstring(self.get(url).text)
    for leg in doc.xpath('//div[@id="members"]/div[@id="members"]/p/a/text()'):
        leg = leg.replace('Representative ', '')
        leg = leg.replace('Senator ', '')
        leg = leg.strip()

        # Roles appear as a parenthesized suffix, e.g. "Jane Doe (Chair)".
        if ' (' in leg:
            leg, role = leg.split(' (')
            if 'Vice-Chair' in role:
                role = 'vice-chair'
            elif 'Co-Chair' in role:
                role = 'co-chair'
            elif 'Chair' in role:
                role = 'chair'
            else:
                raise Exception('unknown role: %s' % role)
        else:
            role = 'member'

        org.add_member(leg, role)

    return org
def scrape_page(self, link, chamber=None):
    """Scrape a committee roster page reached from the committee list."""
    page = self.lxmlize(link.attrib['href'])

    role_map = {
        "Chair": "chair",
        "Vice Chair": "vice-chair",
        "Vice-Chair": "vice-chair",
    }

    committee = Organization(link.text, chamber=chamber,
                             classification='committee')
    committee.add_source(link.attrib['href'])

    for member in page.xpath('//div[@class="members"]/'
                             'div[@class="roster-item"]'):
        details = member.xpath('.//div[@class="member-details"]')[0]

        # This page does random weird things with whitepace to names
        person = ' '.join(details.xpath('./h4')[0].text_content().strip().split())
        if not person:
            continue

        role_node = details.xpath('./span[@class="member-role"]')
        role = role_map[role_node[0].text] if role_node else 'member'
        committee.add_member(person, role=role)

    yield committee
def scrape(self, chamber=None):
    """Scrape NC standing/select committees for one or both chambers."""
    base_url = ('http://www.ncga.state.nc.us/gascripts/Committees/'
                'Committees.asp?bPrintable=true&sAction=ViewCommitteeType&sActionDetails=')
    chamber_slugs = {'upper': ['Senate%20Standing', 'Senate%20Select'],
                     'lower': ['House%20Standing', 'House%20Select']}

    chambers = [chamber] if chamber else ['upper', 'lower']

    for chamber in chambers:
        for ctype in chamber_slugs[chamber]:
            doc = lxml.html.fromstring(self.get(base_url + ctype).text)
            doc.make_links_absolute(base_url + ctype)

            for comm in doc.xpath('//ul/li/a'):
                name = comm.text
                # skip committee of whole Senate
                if 'Whole Senate' in name:
                    continue
                url = comm.get('href')
                committee = Organization(name=name, chamber=chamber,
                                         classification="committee")
                self.scrape_committee(committee, url)
                committee.add_source(url)

                if not committee._related:
                    self.warning('empty committee: %s', name)
                else:
                    yield committee
def scrape_homepage(self, leg, chamber, homepage):
    """Scrape a legislator's homepage for bio, fax number, and committees.

    Mutates `leg` in place and registers committee organizations on
    self.committees keyed by (chamber, name).
    """
    page = self.get(homepage).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(homepage)

    bio = page.xpath(
        "//div[@class='biography']//div[@class='right']//p/text()")
    if bio != []:
        bio = bio[0]
        leg.extras['biography'] = bio

    # The contact module mixes several text lines; keep only fax lines.
    fax_line = [
        x.strip() for x in page.xpath(
            "//div[@class='contactModule']/div[@class='data']/text()"
        ) if "Fax" in x
    ]
    if fax_line:
        fax_number = re.search(
            r'(\(\d{3}\)\s\d{3}\-\d{4})', fax_line[0]
        ).group(1)
        leg.add_contact_detail(type='fax', value=fax_number,
                               note='Capitol Office')

    ctties = page.xpath("//div[@class='committeeList']//a")
    for a in ctties:
        entry = a.text_content()
        # committee_cache memoizes fetched positions per committee entry.
        if entry in committee_cache:
            committee_positions = committee_cache[entry]
        else:
            committee_positions = self.fetch_committee_positions(a)
            committee_cache[entry] = committee_positions

        # Joint committees belong to the whole legislature.
        chmbr = "legislature" if "joint" in entry.lower() else chamber
        if entry in JOINT_COMMITTEE_OVERRIDE:
            chmbr = "legislature"

        kwargs = {}  # NOTE(review): populated below but apparently unused
        if "subcommittee" in entry.lower():
            # Map known subcommittees to their parent committee name;
            # unknown subcommittees abort the scrape loudly.
            if entry in SUBCOMMITTEES:
                kwargs['subcommittee'] = entry
                entry = SUBCOMMITTEES[entry]
            else:
                self.warning("No subcommittee known: '%s'" % (entry))
                raise Exception

        # Create each (chamber, committee) organization only once.
        if (chmbr, entry) not in self.committees:
            org = Organization(
                name=entry,
                chamber=chmbr,
                classification='committee',
            )
            self.committees[(chmbr, entry)] = org
        else:
            org = self.committees[(chmbr, entry)]

        org.add_source(homepage)
        leg.add_membership(org)
def scrape_committees_pdf(self, year, chamber, filename, url):
    """Parse a committee-roster PDF, yielding non-empty Organizations."""
    # The 2015 House PDF needs a manual text repair before parsing.
    if chamber == 'lower' and year == '2015':
        text = self._fix_house_text(filename).decode()
    else:
        text = convert_pdf(filename, type='text-nolayout').decode()

    # Rejoin committee names the PDF layout splits across lines.
    for hotgarbage, replacement in (
        (r'Judicial Branch, Law Enforcement,\s+and\s+Justice',
         'Judicial Branch, Law Enforcement, and Justice'),
        (r'Natural Resources and\s+Transportation',
         'Natural Resources and Transportation'),
        (r'(?u)Federal Relations, Energy,?\s+and\s+Telecommunications',
         'Federal Relations, Energy, and Telecommunications')
    ):
        text = re.sub(hotgarbage, replacement, text)

    lines = iter(text.splitlines())

    # Drop any lines before the ag committee.
    lines = dropwhile(lambda s: 'Agriculture' not in s, lines)

    comm = None
    for line in lines:
        # Replace Unicode variants with ASCII equivalents
        line = line.replace(" ", " ").replace("‐", "-")

        if 'Subcommittees' in line:
            self.warning("Currently, we're skipping subcommittees")
            # https://github.com/openstates/openstates/issues/2099
            break

        if is_committee_name(line):
            # New committee starts: yield the previous one if it
            # actually accumulated members.
            if comm and comm._related:
                yield comm
            committee = line.strip()
            comm = Organization(name=committee,
                                chamber=chamber,
                                classification='committee')
            comm.add_source(url)
        elif is_legislator_name(line):
            name, party = line.rsplit('(', 1)
            name = name.strip().replace("Rep. ", "").replace("Sen. ", "")
            # Role codes trail the name inside the parenthetical.
            if re.search(' Ch', party):
                role = 'chair'
            elif ' VCh' in party:
                role = 'vice chair'
            elif ' MVCh' in party:
                role = 'minority vice chair'
            else:
                role = 'member'
            comm.add_member(name, role)

    # Yield the final committee on the page, if it has members.
    if comm._related:
        yield comm
def scrape_comms(self, chamber, ctype):
    """Scrape every committee of one type for the given chamber."""
    for anchor in self.scrape_comm_list(ctype):
        link = anchor.attrib['href']
        comm_name = clean(anchor.text_content())
        self.info("url " + link)

        org = Organization(chamber=chamber, name=comm_name,
                           classification='committee')
        self.add_members(org, link)
        org.add_source(link)
        yield org
def scrape(self, session=None):
    """Scrape Wyoming committees for the given (or latest) session."""
    if session is None:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    # com_types = ['J', 'SE', 'O']
    # base_url = 'https://wyoleg.gov/LsoService/api/committeeList/2018/J'
    url = 'https://wyoleg.gov/LsoService/api/committees/{}'.format(session)
    coms_json = json.loads(self.get(url).content.decode('utf-8'))

    for row in coms_json:
        com_url = 'https://wyoleg.gov/LsoService/api/committeeDetail/{}/{}'.format(
            session, row['ownerID'])
        com = json.loads(self.get(com_url).content.decode('utf-8'))

        # WY doesn't seem to have any house/senate only committees
        # that I can find
        committee = Organization(
            name=com['commName'],
            chamber='legislature',
            classification='committee')

        for member in com['commMembers']:
            role = 'chairman' if member['chairman'] == 'Chairman' else 'member'
            committee.add_member(member['name'], role)

        # some WY committees have non-legislators appointed to the member
        # by the Governor, but the formatting is super inconsistent
        if com['otherMembers']:
            committee.extras['other_members'] = com['otherMembers']

        committee.extras['wy_id'] = com['commID']
        committee.extras['wy_code'] = com['ownerID']
        committee.extras['wy_type_code'] = com['type']
        committee.extras['budget'] = com['budget']

        if com['statAuthority']:
            committee.extras['statutory_authority'] = com['statAuthority']
        if com['number']:
            committee.extras['seat_distribution'] = com['number']

        committee.add_identifier(
            scheme='WY Committee ID', identifier=str(com['commID']))
        committee.add_identifier(
            scheme='WY Committee Code', identifier=str(com['ownerID']))
        if com['description']:
            committee.add_identifier(
                scheme='Common Name', identifier=com['description'])

        source_url = 'http://wyoleg.gov/Committees/{}/{}'.format(
            session, com['ownerID'])
        committee.add_source(source_url)
        yield committee
def scrape_upper_committee(self, committee_name, url):
    """Scrape a NY Senate committee page, recording chair and members."""
    page = self.lxmlize(url)

    committee = Organization(chamber="upper", name=committee_name,
                             classification="committee")
    committee.add_source(url)

    # Committee member attributes.
    member_name = None
    member_role = None

    # Attempt to record the committee chair.
    committee_chair = self.get_node(
        page,
        '//div[@class="nys-senator" and div[@class="nys-senator--info"'
        ' and p[@class="nys-senator--title" and'
        ' normalize-space(text())="Chair"]]]',
    )
    if committee_chair is None:
        self.logger.warning(
            "Missing chairperson for the {} committee.".format(committee_name))
    else:
        info_node = self.get_node(
            committee_chair,
            'div[@class="nys-senator--info" and p[@class='
            '"nys-senator--title" and contains(text(), "Chair")]]',
        )
        if info_node is not None:
            # Attempt to retrieve committee chair's name.
            member_name_text = self.get_node(
                info_node,
                './h4[@class="nys-senator--name"][1]/a[1]/text()')
            if member_name_text is not None:
                member_name = member_name_text.strip()
            else:
                self.logger.warning(
                    "Could not find the name of the chair for the {} committee"
                    .format(committee_name))

            # Attempt to retrieve committee chair's role (explicitly).
            member_role_text = self.get_node(
                info_node,
                './p[@class="nys-senator--title" and contains(text(), '
                '"Chair")][1]/text()',
            )
            if member_role_text is not None:
                member_role = member_role_text.strip()
            else:
                # This seems like a silly case, but could still be useful
                # to check for.
                self.logger.warning(
                    "Could not find the role of the chair for the {} committee"
                    .format(committee_name))

        if member_name is not None and member_role is not None:
            committee.add_member(member_name, member_role)
        else:
            self.logger.warning(
                "Could not find information for the chair of the {} committee."
                .format(committee_name))

    # Get list of regular committee members.
    member_nodes = self.get_nodes(
        page,
        '//div[contains(concat(" ", @class, " "), '
        '" c-senators-container ")]//div[@class="view-content"]/'
        ' div/a',
    )

    # Attempt to record each committee member.
    for member_node in member_nodes:
        member_name = None
        member_name_text = self.get_node(
            member_node,
            './/div[@class="nys-senator--info"][1]/h4[@class='
            '"nys-senator--name"][1]/text()',
        )
        if member_name_text is not None:
            member_name = member_name_text.strip()

        if member_name is not None:
            committee.add_member(member_name, "member")
        else:
            self.logger.warning(
                "Could not find the name of a member in the {} committee"
                .format(committee_name))

    return committee
def get_organizations(self):
    """Yield the LA Metro Board of Directors (with all of its posts)
    followed by its two related corporations.
    """
    board = Organization(name="Board of Directors",
                         classification="legislature")

    la_city_division = 'ocd-division/country:us/state:ca/place:los_angeles'
    board.add_post('Mayor of the City of Los Angeles',
                   'Board Member',
                   division_id=la_city_division)

    # One seat per LA County supervisorial district.
    county_division = ('ocd-division/country:us/state:ca'
                       '/county:los_angeles/council_district:{}')
    for seat in range(1, 6):
        board.add_post(
            'Los Angeles County Board Supervisor, District {}'.format(seat),
            'Board Member',
            division_id=county_division.format(seat))

    board.add_post('Appointee of Mayor of the City of Los Angeles',
                   'Board Member',
                   division_id=la_city_division)
    board.add_post('Appointee of Governor of California',
                   'Nonvoting Board Member',
                   division_id='ocd-division/country:us/state:ca')

    # City Selection Committee appointees, one per sector.
    sector_division = ('ocd-division/country:us/state:ca'
                       '/county:los_angeles/la_metro_sector:{}')
    sectors = (
        ('North County/San Fernando Valley',
         'north_county_san_fernando_valley'),
        ('Southwest Corridor', 'southwest_corridor'),
        ('San Gabriel Valley', 'san_gabriel_valley'),
        ('Southeast Long Beach', 'southeast_long_beach'),
    )
    for sector_label, sector_slug in sectors:
        board.add_post(
            'Appointee of Los Angeles County City Selection Committee, '
            '{} sector'.format(sector_label),
            'Board Member',
            division_id=sector_division.format(sector_slug))

    # Leadership posts carry their own role names.
    for leadership_post in ('Chair', '1st Vice Chair', '2nd Vice Chair'):
        board.add_post(leadership_post, leadership_post)

    yield board

    # Related corporations share the same placeholder source.
    for corporation_name in ("Crenshaw Project Corporation", "LA SAFE"):
        corporation = Organization(name=corporation_name,
                                   classification="corporation")
        corporation.add_source('foo')
        yield corporation
def scrape(self):
    """Scrape Chicago council members and their committee memberships
    from Legistar, yielding each Person and then each Organization.
    """
    # Committees are accumulated across members, keyed by name, so each
    # is yielded exactly once at the end.
    committee_d = {}
    # Legislative bodies that are not actual committees.
    non_committees = ('City Council', 'Office of the Mayor')

    for councilman, committees in self.councilMembers():
        # Skip rows with no ward/office assignment.
        if councilman['Ward/Office'] == "":
            continue

        ward = councilman['Ward/Office']
        if ward not in [
                "Mayor",
                "Clerk",
        ]:
            # Numeric wards are normalized to "Ward N".
            ward = "Ward {}".format(int(ward))

        p = Person(councilman['Person Name']['label'],
                   district=ward,
                   primary_org="legislature")

        if councilman['Photo']:
            p.image = councilman['Photo']

        # Map Legistar column names to (contact type, note) pairs.
        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }
        for contact_type, (type_, _note) in contact_types.items():
            if councilman[contact_type]:
                p.add_contact_detail(type=type_,
                                     value=councilman[contact_type],
                                     note=_note)

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['label'],
                                 note='E-mail')

        if councilman['Website']:
            p.add_link(councilman['Website']['url'])
        p.add_source(MEMBERLIST)

        for committee, _, _ in committees:
            committee_name = committee['Legislative Body']['label']
            if committee_name and committee_name not in non_committees:
                # Create each committee Organization only once.
                o = committee_d.get(committee_name, None)
                if o is None:
                    o = Organization(committee_name,
                                     classification='committee')
                    o.add_source(
                        "https://chicago.legistar.com/Departments.aspx")
                    committee_d[committee_name] = o
                o.add_member(p, role=committee["Title"])
        yield p

    for o in committee_d.values():
        yield o
def _scrape_lower_chamber(self, session):
    """Scrape Missouri House committees and yield Organizations.

    :param session: legislative session identifier (unused here beyond
        logging context; URLs come from ``self._reps_url_base``).
    """
    self.info('Scraping lower chamber for committees.')

    chamber = 'lower'

    url = '{base}CommitteeHierarchy.aspx'.format(base=self._reps_url_base)
    page_string = self.get(url).text
    page = lxml.html.fromstring(page_string)
    # Last tr has the date
    committee_links = page.xpath('//li//a')
    for committee_link in committee_links:
        committee_name = committee_link.text_content().strip()
        committee_url = committee_link.attrib.get('href')
        # Member listings live on a separate grid page.
        committee_url = '{base}{members}{url}'.format(
            base=self._reps_url_base,
            members=
            "MemberGridCluster.aspx?filter=compage&category=committee&",
            url=committee_url)

        # Joint committees belong to the whole legislature, not one chamber.
        actual_chamber = chamber
        if 'joint' in committee_name.lower():
            actual_chamber = 'legislature'

        # Strip boilerplate words so only the topical name remains.
        committee_name = committee_name.replace('Committee On ', '')
        committee_name = committee_name.replace('Special', '')
        committee_name = committee_name.replace('Select', '')
        committee_name = committee_name.replace('Special', '')
        committee_name = committee_name.replace('Joint', '')
        committee_name = committee_name.replace(' Committee', '')
        committee_name = committee_name.strip()

        committee = Organization(
            committee_name,
            chamber=actual_chamber,
            classification='committee',
        )

        committee_page_string = self.get(committee_url).text
        committee_page = lxml.html.fromstring(committee_page_string)
        # First tr has the title (sigh)
        mem_trs = committee_page.xpath(
            "//table[@id='gvMembers_DXMainTable']//tr[contains(@class, 'dxgvDataRow')]"
        )
        for mem_tr in mem_trs:
            mem_code = None
            mem_links = mem_tr.xpath('td/a[1]')
            mem_role_string = mem_tr.xpath(
                'td[4]')[0].text_content().strip()

            if len(mem_links):
                # Member detail href doubles as a stable member code.
                mem_code = mem_links[0].attrib.get('href')

            # Output is "Rubble, Barney, Neighbor"
            mem_parts = mem_tr.xpath(
                'td[2]')[0].text_content().strip().split(',')
            if self._no_members_text in mem_parts:
                continue
            mem_name = (mem_parts[1].strip() + ' ' + mem_parts[0].strip())

            # Sometimes Senator abbreviation is in the name
            mem_name = mem_name.replace('Sen. ', '')
            mem_name = mem_name.replace('Rep. ', '')

            mem_role = 'member'
            # Non-empty role cells (e.g. "Chairman") override the default.
            if len(mem_role_string) > 2:
                mem_role = mem_role_string.lower()

            membership = committee.add_member(mem_name, role=mem_role)
            membership.extras = {'code': mem_code}

        committee.add_source(url)
        committee.add_source(committee_url)
        yield committee
def _scrape_upper_chamber(self, session):
    """Scrape Missouri Senate standing committees and yield Organizations.

    :param session: session string, e.g. "2017"; its tail (``session[2:]``)
        is embedded in archive URLs for non-current sessions.
    """
    self.info('Scraping upper chamber for committees.')

    chamber = 'upper'

    # URL layout changed in 2015; the current session also has its own path.
    if self._is_post_2015 and self.latest_session() != session:
        url = '{base}{year}web/standing-committees'.format(
            base=self._senate_url_base, year=session[2:])
        comm_container_id = 'primary'
    elif session == self.latest_session():
        url = '{base}standing-committees'.format(
            base=self._senate_url_base)
        comm_container_id = 'primary'
    else:
        url = '{base}{year}info/com-standing.htm'.format(
            base=self._senate_url_base, year=session[2:])
        comm_container_id = 'mainContent'

    page = self.lxmlize(url)
    comm_links = self.get_nodes(
        page, '//div[@id = "{}"]//p/a'.format(comm_container_id))

    for comm_link in comm_links:
        # Normalize to uppercase - varies between "Assigned bills" and "Assigned Bills"
        if "ASSIGNED BILLS" in comm_link.text_content().upper():
            continue

        comm_link = comm_link.attrib['href']

        # Heuristic filter for actual committee links per site era.
        if self._is_post_2015:
            if "web" not in comm_link:
                continue
        else:
            if "comm" not in comm_link:
                continue

        comm_page = self.lxmlize(comm_link)

        if self._is_post_2015:
            comm_name = self.get_node(comm_page,
                                      '//h1[@class="entry-title"]/text()')
            members = self.get_nodes(
                comm_page, '//div[@id="bwg_standart_thumbnails_0"]/a')
        else:
            comm_name = self.get_node(comm_page,
                                      '//div[@id="mainContent"]/p/text()')
            members = self.get_nodes(comm_page,
                                     '//div[@id="mainContent"]//td/a')

        comm_name = comm_name.replace(' Committee', '')
        comm_name = comm_name.strip()

        committee = Organization(comm_name,
                                 chamber=chamber,
                                 classification='committee')

        for member in members:
            mem_link = member.attrib.get("href", '')
            if "mem" not in mem_link:
                continue

            if self._is_post_2015:
                # NOTE(review): this lookup result is immediately
                # overwritten by the split() below — appears vestigial.
                mem_parts = self.get_node(
                    member, './/span[@class="bwg_title_spun2_0"]')

            mem_parts = member.text_content().strip().split(',')
            # Senator title stripping mainly for post-2015.
            mem_name = re.sub(r'^Senator[\s]+', '', mem_parts[0])

            # this one time, MO forgot the comma between
            # the member and his district. Very rarely relevant
            try:
                int(mem_name[-4:-2]
                    )  # the district's # is in this position
            except ValueError:
                pass
            else:
                mem_name = " ".join(
                    mem_name.split(" ")[0:-1])  # member name fixed

            # ok, so this next line. We don't care about
            # the first 2 elements of mem_parts anymore
            # so whatever. But if the member as a role, we want
            # to make sure there are 3 elements in mem_parts and
            # the last one is actually the role. This sucks, sorry.
            mem_parts.append(mem_parts[-1])

            mem_role = 'member'
            if len(mem_parts) > 2:
                mem_role = mem_parts[2].lower().split(' ')[0].strip()

            if mem_name == "":
                continue

            committee.add_member(mem_name, role=mem_role)

        committee.add_source(url)
        committee.add_source(comm_link)
        yield committee
def scrape(self):
    """Generic CSV people scraper: yield Organizations, Posts, and
    People parsed from ``self.csv_url``.

    Subclass-configured knobs read here: ``delimiter``, ``encoding``,
    ``skip_rows``, ``header_converter``, ``is_valid_row``, ``corrections``,
    ``many_posts_per_area``, ``unique_roles``, ``other_names``.
    """
    organizations = {}
    # Tracks per-role, per-district seat counters so duplicate district
    # names become "District (seat N)".
    seat_numbers = defaultdict(lambda: defaultdict(int))

    reader = self.csv_reader(self.csv_url,
                             delimiter=self.delimiter,
                             header=True,
                             encoding=self.encoding,
                             skip_rows=self.skip_rows)
    reader.fieldnames = [
        self.header_converter(field) for field in reader.fieldnames
    ]
    for row in reader:
        try:
            if self.is_valid_row(row):
                # Corrections are either a callable (transform) or a
                # dict (lookup of known-bad values).
                for key, corrections in self.corrections.items():
                    if not isinstance(corrections, dict):
                        row[key] = corrections(row[key])
                    elif row[key] in corrections:
                        row[key] = corrections[row[key]]

                organization_classification = 'legislature'
                organization_name = row['organization']
                organization_key = organization_name.lower()
                # Yield each organization only the first time it appears.
                if organization_key in organizations:
                    organization = organizations[organization_key]
                else:
                    organization = Organization(
                        organization_name,
                        classification=organization_classification)
                    organization.add_source(self.csv_url)
                    yield organization
                    organizations[organization_key] = organization

                if not row['primary role']:
                    row['primary role'] = 'Councillor'

                role = row['primary role']
                post = Post(role=role,
                            label=organization_name,
                            organization_id=organization._id)
                yield post

                name = row['name'].strip(' .,')
                district = row['district name']

                if self.many_posts_per_area and role not in self.unique_roles:
                    seat_numbers[role][district] += 1
                    district = '{} (seat {})'.format(
                        district, seat_numbers[role][district])

                p = Person(primary_org=organization_classification,
                           name=name,
                           district=district,
                           role=role,
                           party=row.get('party name'))
                p.add_source(self.csv_url)

                # Optional columns — only set when present.
                if row.get('gender'):
                    p.gender = row['gender']
                if row.get('photo url'):
                    p.image = row['photo url']

                if row.get('source url'):
                    p.add_source(row['source url'].strip(' .,'))

                if row.get('website'):
                    p.add_link(row['website'], note='web site')
                if row.get('facebook'):
                    # Strip tracking fragments/query strings.
                    p.add_link(re.sub(r'[#?].+', '', row['facebook']))
                if row.get('twitter'):
                    p.add_link(row['twitter'])

                # NOTE(review): 'email' and 'address' use direct indexing
                # (KeyError if absent) while the rest use .get() — the
                # broad except below absorbs the difference. Confirm
                # whether these columns are required.
                if row['email']:
                    p.add_contact('email', row['email'].strip(' .,'))
                if row['address']:
                    p.add_contact('address', row['address'], 'legislature')
                if row.get('phone'):
                    p.add_contact('voice', row['phone'], 'legislature')
                if row.get('fax'):
                    p.add_contact('fax', row['fax'], 'legislature')
                if row.get('cell'):
                    p.add_contact('cell', row['cell'], 'legislature')
                if row.get('birth date'):
                    p.birth_date = row['birth date']
                if row.get('incumbent'):
                    p.extras['incumbent'] = row['incumbent']

                if name in self.other_names:
                    for other_name in self.other_names[name]:
                        p.add_name(other_name)

                # Validate person entity so that we can catch the exception if needed.
                p.validate()

                yield p
        except Exception as e:
            # Deliberate best-effort: a bad row is reported and skipped
            # rather than aborting the whole scrape.
            print(repr(e))
            continue
def scrape(self):
    '''
    Scrape the web to create a dict with all active organizations.
    Then, we can access the correct URL for the organization detail page.
    '''
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'

    # Map organization name -> its Legistar web info (includes URL).
    web_info = {}

    for _, organizations in web_scraper.councilMembers():
        for organization, _, _ in organizations:
            organization_name = organization['Department Name'][
                'label'].strip()
            organization_info = organization['Department Name']

            web_info[organization_name] = organization_info

    body_types = self.body_types()

    # Exactly one body is expected to match (tuple-unpack asserts this).
    board_of_directors, = [
        body for body in self.bodies()
        if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
    ]
    board_of_directors["BodyName"] = "Board of Directors"

    # Group every office (term) by the person holding it.
    terms = collections.defaultdict(list)
    for office in self.body_offices(board_of_directors):
        terms[office['OfficeRecordFullName']].append(office)

    members = {}
    for member, offices in terms.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']

            # Leadership roles (Chair, CEO, ...) become appointed terms.
            if role not in {'Board Member', 'non-voting member'}:
                p.add_term(
                    role,
                    'legislature',
                    start_date=self.toDate(term['OfficeRecordStartDate']),
                    end_date=self.toDate(term['OfficeRecordEndDate']),
                    appointment=True)

            # Everyone except the CEO also holds a board seat.
            if role != 'Chief Executive Officer':
                if role == 'non-voting member':
                    member_type = 'Nonvoting Board Member'
                    post = NONVOTING_POSTS.get(member)
                else:
                    member_type = 'Board Member'
                    post = VOTING_POSTS.get(member)

                start_date = self.toDate(term['OfficeRecordStartDate'])
                end_date = self.toDate(term['OfficeRecordEndDate'])
                board_membership = p.add_term(member_type,
                                             'legislature',
                                             district=post,
                                             start_date=start_date,
                                             end_date=end_date)

                # Flag acting members whose tenure ends by this term's end.
                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                    p.name)
                if acting_member_end_date and acting_member_end_date <= end_date:
                    board_membership.extras = {'acting': 'true'}

            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] in (
                body_types['Committee'],
                body_types['Independent Taxpayer Oversight Committee']):
            organization_name = body['BodyName'].strip()

            o = Organization(organization_name,
                             classification='committee',
                             parent_id={'name': 'Board of Directors'})

            organization_info = web_info.get(organization_name, {})
            # NOTE(review): the fallback concatenates WEB_URL with an
            # absolute URL, which looks malformed — confirm intended
            # default against the live scraper.
            organization_url = organization_info.get(
                'url',
                self.WEB_URL + 'https://metro.legistar.com/Departments.aspx')

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(organization_url, note='web')

            for office in self.body_offices(body):
                role = office['OfficeRecordTitle']
                # Committee-level roles other than leadership collapse
                # to Member / Nonvoting Member.
                if role not in ("Chair", "Vice Chair",
                                "Chief Executive Officer"):
                    if role == 'non-voting member':
                        role = 'Nonvoting Member'
                    else:
                        role = 'Member'

                person = office['OfficeRecordFullName']

                if person in members:
                    p = members[person]
                else:
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                start_date = self.toDate(office['OfficeRecordStartDate'])
                end_date = self.toDate(office['OfficeRecordEndDate'])
                membership = p.add_membership(organization_name,
                                              role=role,
                                              start_date=start_date,
                                              end_date=end_date)

                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                    p.name)
                if acting_member_end_date and acting_member_end_date <= end_date:
                    membership.extras = {'acting': 'true'}

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Scrape LA Metro board members and committees (older variant:
    no web-info lookup, no acting-member flags).
    """
    body_types = self.body_types()

    # Exactly one body is expected to match (tuple-unpack asserts this).
    board_of_directors, = [
        body for body in self.bodies()
        if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
    ]
    board_of_directors["BodyName"] = "Board of Directors"

    # Group every office (term) by the person holding it.
    terms = collections.defaultdict(list)
    for office in self.body_offices(board_of_directors):
        terms[office['OfficeRecordFullName']].append(office)

    members = {}
    for member, offices in terms.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']

            # Leadership roles (Chair, CEO, ...) become extra terms.
            if role not in {'Board Member', 'non-voting member'}:
                p.add_term(
                    role,
                    'legislature',
                    start_date=self.toDate(term['OfficeRecordStartDate']),
                    end_date=self.toDate(term['OfficeRecordEndDate']))

            # Everyone except the CEO also holds a board seat.
            if role != 'Chief Executive Officer':
                if role == 'non-voting member':
                    member_type = 'Nonvoting Board Member'
                    post = NONVOTING_POSTS.get(member)
                else:
                    member_type = 'Board Member'
                    post = VOTING_POSTS.get(member)

                p.add_term(
                    member_type,
                    'legislature',
                    district=post,
                    start_date=self.toDate(term['OfficeRecordStartDate']),
                    end_date=self.toDate(term['OfficeRecordEndDate']))

            source_urls = self._person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Committee']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Board of Directors'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.
                format(**body),
                note='web')

            for office in self.body_offices(body):
                role = office['OfficeRecordTitle']
                if role not in ("Chair", "Vice Chair"):
                    role = 'Member'

                person = office['OfficeRecordFullName']

                if person in members:
                    p = members[person]
                else:
                    p = Person(person)

                    source_urls = self._person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                p.add_membership(body['BodyName'],
                                 role=role,
                                 start_date=self.toDate(
                                     office['OfficeRecordStartDate']),
                                 end_date=self.toDate(
                                     office['OfficeRecordEndDate']))

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Scrape Yukon municipal councils from the territory's PDF listing.

    Downloads the PDF, converts it to text with ``pdftotext -layout``,
    splits it into per-municipality blocks, and yields an Organization
    plus one Person per mayor/councillor, with contact details attached
    to the membership.
    """
    response = urlopen(COUNCIL_PAGE).read()
    # Write in binary mode: the response is raw PDF bytes, and text mode
    # ('w' in the original) risks newline translation corrupting the
    # file. The context manager guarantees the handle is closed before
    # pdftotext reads it.
    with open('/tmp/yt.pdf', 'wb') as pdf:
        pdf.write(response)
    try:
        data = subprocess.check_output(
            ['pdftotext', '-layout', '/tmp/yt.pdf', '-'])
        # Municipalities are separated by blank lines in the text dump.
        data = re.split(r'\n\s*\n', data)
        for municipality in data:
            if 'Councillors' not in municipality:
                continue

            lines = municipality.split('\n')
            if 'Page' in lines[0]:
                lines.pop(0)
                if not lines[0].strip():
                    lines.pop(0)

            # Column boundaries inferred from runs of whitespace in the
            # first line of the layout-preserving text.
            col1end = re.search(r'\s{2,}(\w)', lines[0].strip()).end()
            col2end = re.search(r':\s{2,}(\w)', lines[0].strip()).end()

            if 'Council' in lines[1]:
                address = lines[2][:col1end - 1].strip(
                ) + ' ' + lines[3][:col1end - 1].strip()
                district = lines[0][:col1end - 1].strip(
                ) + ' ' + lines[1][:col1end - 1].strip()
            else:
                address = lines[1][:col1end - 1].strip(
                ) + ' ' + lines[2][:col1end - 1].strip()
                district = lines[0][:col1end - 1].strip()

            organization = Organization(
                name=district + ' Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            organization.add_source(COUNCIL_PAGE)
            yield organization

            phone = re.findall(
                r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                municipality)[0].replace(') ', '-')
            email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
            fax = None
            if 'Fax' in municipality:
                fax = re.findall(
                    r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                    municipality)[0].replace(') ', '-')
            website = None
            if 'Website' in municipality:
                website = re.findall(r'((http:\/\/|www.)(\S*))',
                                     municipality)[0][0]

            # Walk the lines; the "Mayor:"/"Councillors" headers switch
            # the role applied to the names that follow.
            councillor_or_mayor = False
            for line in lines:
                if 'Mayor:' in line:
                    councillor_or_mayor = True
                    role = 'Mayor'
                    continue
                if 'Councillors' in line:
                    councillor_or_mayor = True
                    role = 'Councillor'
                    continue
                if councillor_or_mayor:
                    councillor = line[col1end - 1:col2end - 1].strip()
                    if not councillor:
                        continue

                    p = Person(primary_org='legislature',
                               name=councillor,
                               district=district)
                    p.add_source(COUNCIL_PAGE)
                    membership = p.add_membership(organization,
                                                  role=role,
                                                  district=district)
                    membership.add_contact_detail('address', address,
                                                  'legislature')
                    membership.add_contact_detail('voice', phone,
                                                  'legislature')
                    membership.add_contact_detail('email', email)
                    if fax:
                        membership.add_contact_detail(
                            'fax', fax, 'legislature')
                    if website:
                        p.add_link(website)
                    yield p
    finally:
        # Remove the temp file directly (the original shelled out to
        # `rm` via os.system, and leaked the file if parsing raised).
        os.remove('/tmp/yt.pdf')
def get_organizations(self):
    """Yield the US Congress organization tree: the legislature, its two
    chambers, and the two disclosure-record offices. Each organization
    is also stashed on ``self`` for later reference.

    Fix: the original yielded ``legislature`` a second time at the end,
    emitting a duplicate of an organization already sent downstream.
    """
    legislature = Organization("United States Congress",
                               classification='legislature')
    self._legislature = legislature
    yield legislature

    senate = Organization(
        name="United States Senate",
        classification='upper',
        parent_id=legislature._id,
    )
    self._senate = senate
    yield senate

    house = Organization(
        name="United States House",
        classification='lower',
        parent_id=legislature._id,
    )
    self._house = house
    yield house

    # Senate Office of Public Records: handles lobbying disclosure.
    sopr = Organization(
        name="Office of Public Record, US Senate",
        classification="office",
        parent_id=senate._id,
    )
    sopr.add_contact_detail(type="voice", value="202-224-0322")
    sopr.add_source(url="http://www.senate.gov/pagelayout/legislative/"
                    "one_item_and_teasers/opr.htm",
                    note="Profile page")
    sopr.add_source(url="http://www.senate.gov/pagelayout/legislative/"
                    "g_three_sections_with_teasers/lobbyingdisc.htm"
                    "#lobbyingdisc=lda",
                    note="Disclosure Home")
    sopr.add_link(url="http://soprweb.senate.gov/index.cfm"
                  "?event=selectfields",
                  note="Disclosure Search Portal")
    sopr.add_link(url="http://soprweb.senate.gov/",
                  note="Disclosure Electronic Filing System")
    self._sopr = sopr
    yield sopr

    house_clerk = Organization(
        name="Office of the Clerk, US House",
        classification="office",
        parent_id=house._id,
    )
    house_clerk.add_contact_detail(type="voice", value="202-225-7000")
    house_clerk.add_source(url="http://clerk.house.gov/",
                           note="Home page")
    self._house_clerk = house_clerk
    yield house_clerk
def get_organizations(self):
    """Yield Saint Paul's city, council, mayor, and committee
    organizations, discovering committees by crawling the city's
    public event calendar.

    NOTE(review): relies on a module-level ``date_range`` (declared
    global here) being populated before this runs — confirm where it
    is assigned.
    """
    global date_range
    city = Organization('City of Saint Paul', classification='executive')
    city.add_post(
        'Mayor',
        'Mayor',
        division_id='ocd-division/country:us/state:mn/place:st_paul')
    city.add_post(
        'City Clerk',
        'City Clerk',
        division_id='ocd-division/country:us/state:mn/place:st_paul')
    yield city

    # NOTE(review): parent_id is given the Organization object itself;
    # sibling scrapers in this codebase pass `._id` — confirm pupa
    # accepts the object form here.
    council = Organization(name="Saint Paul City Council",
                           classification="legislature",
                           parent_id=city)
    for x in range(1, 8):
        council.add_post(
            "Ward {}".format(x),
            "Councilmember",
            division_id=
            'ocd-division/country:us/state:mn/place:st_paul/ward:{}'.
            format(x))
    yield council

    carter = Person(name="Melvin Carter")
    carter.add_term('Mayor',
                    'executive',
                    start_date=dtdate(2018, 1, 19),
                    appointment=True)
    carter.add_source('http://www.google.com')
    yield carter

    # Crawl the public calendar for each date, collecting distinct
    # committee-like meetings (deduplicated by cleaned-up name).
    new_meetings = []
    temp_labels = []
    for date in date_range:
        print('Checking date:', date)
        root = requests.get("https://www.stpaul.gov/calendar/" + date)
        base = html.fromstring(root.text)
        items = base.xpath('.//*/div[@class="view-content"]/div')
        meetings = []
        for i in items:
            # Only rows that carry a date span are actual events.
            if len(
                    i.xpath(
                        './/*/span[@class="date-display-single"]/text()')
            ) > 0:
                d = {}
                d['date'] = i.xpath(
                    './/*/span[@class="date-display-single"]/text()')[0]
                d['info'] = i.xpath(
                    './/*/span[@class="field-content"]/a/text()')[0]
                d['link'] = i.xpath(
                    './/*/span[@class="field-content"]/a/@href')[0]
                meetings.append(d)
        for m in meetings:
            m['link'] = "https://www.stpaul.gov" + m['link']
        for m in meetings:
            # Fetch each event page to confirm it still exists.
            r = requests.get(m['link'])
            b = html.fromstring(r.text)
            exists = b.xpath('.//div[@class="node-content clearfix"]')
            if len(exists) > 0:
                # Skip council sessions, legislative hearings, holidays.
                if not 'City Council' in m[
                        'info'] and not 'Legislative' in m[
                            'info'] and not 'Holiday' in m['info']:
                    m['name'] = m['info'].replace('Meeting', '').replace(
                        ' - Cancelled', '').replace('Events', '').strip()
                    if not m['name'] in temp_labels:
                        temp_labels.append(m['name'])
                        new_meetings.append(m)

    print('Creating organizations')
    for m in new_meetings:
        print(m)
        cmt = Organization(name=m['name'],
                           classification='committee',
                           parent_id=city)
        cmt.add_source(m['link'])
        yield cmt
def scrape(self):
    '''
    Scrape the web to create a dict with all active organizations.
    Then, we can access the correct URL for the organization detail page.
    '''
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'

    # Map organization name -> its Legistar web info (includes URL).
    web_info = {}

    for _, organizations in web_scraper.councilMembers():
        for organization, _, _ in organizations:
            organization_name = organization['Department Name'][
                'label'].strip()
            organization_info = organization['Department Name']

            web_info[organization_name] = organization_info

    body_types = self.body_types()

    # Exactly one body is expected to match (tuple-unpack asserts this).
    board_of_directors, = [
        body for body in self.bodies()
        if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
    ]
    board_of_directors["BodyName"] = "Board of Directors"

    # Group every office (term) by the person holding it.
    terms = collections.defaultdict(list)
    for office in self.body_offices(board_of_directors):
        terms[office['OfficeRecordFullName']].append(office)

    members = {}
    for member, offices in terms.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']

            # Leadership roles (Chair, CEO, ...) become appointed terms.
            if role not in {'Board Member', 'non-voting member'}:
                p.add_term(
                    role,
                    'legislature',
                    start_date=self.toDate(term['OfficeRecordStartDate']),
                    end_date=self.toDate(term['OfficeRecordEndDate']),
                    appointment=True)

            # Everyone except the CEO also holds a board seat.
            if role != 'Chief Executive Officer':
                if role == 'non-voting member':
                    member_type = 'Nonvoting Board Member'
                    post = NONVOTING_POSTS.get(member)
                else:
                    member_type = 'Board Member'
                    post = VOTING_POSTS.get(member)

                start_date = self.toDate(term['OfficeRecordStartDate'])
                end_date = self.toDate(term['OfficeRecordEndDate'])
                board_membership = p.add_term(member_type,
                                              'legislature',
                                              district=post,
                                              start_date=start_date,
                                              end_date=end_date)

                # Flag acting members whose tenure ends by this term's end.
                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                    p.name)
                if acting_member_end_date and acting_member_end_date <= end_date:
                    board_membership.extras = {'acting': 'true'}

            # Each term contains first and last names. This should be the same
            # across all of a person's terms, so go ahead and grab them from the
            # last term in the array.
            p.family_name = term['OfficeRecordLastName']
            p.given_name = term['OfficeRecordFirstName']

            # Defensively assert that the given and family names match the
            # expected value.
            if member == 'Hilda L. Solis':
                # Given/family name does not contain middle initial.
                assert p.given_name == 'Hilda' and p.family_name == 'Solis'
            else:
                assert member == ' '.join([p.given_name, p.family_name])

            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] in (
                body_types['Committee'],
                body_types['Independent Taxpayer Oversight Committee']):
            organization_name = body['BodyName'].strip()

            o = Organization(organization_name,
                             classification='committee',
                             parent_id={'name': 'Board of Directors'})

            organization_info = web_info.get(organization_name, {})
            # NOTE(review): the fallback concatenates WEB_URL with an
            # absolute URL, which looks malformed — confirm intended
            # default against the live scraper.
            organization_url = organization_info.get(
                'url',
                self.WEB_URL + 'https://metro.legistar.com/Departments.aspx')

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(organization_url, note='web')

            for office in self.body_offices(body):
                role = office['OfficeRecordTitle']
                # Committee-level roles outside the board-office set
                # collapse to Member / Nonvoting Member.
                if role not in BOARD_OFFICE_ROLES:
                    if role == 'non-voting member':
                        role = 'Nonvoting Member'
                    else:
                        role = 'Member'

                person = office['OfficeRecordFullName']

                # Temporarily skip committee memberships, e.g., for
                # new board members. The content of this array is provided
                # by Metro.
                if person in PENDING_COMMITTEE_MEMBERS:
                    self.warning('Skipping {0} membership for {1}'.format(
                        organization_name, person))
                    continue

                if person in members:
                    p = members[person]
                else:
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                start_date = self.toDate(office['OfficeRecordStartDate'])
                end_date = self.toDate(office['OfficeRecordEndDate'])
                membership = p.add_membership(organization_name,
                                              role=role,
                                              start_date=start_date,
                                              end_date=end_date)

                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                    p.name)
                if acting_member_end_date and acting_member_end_date <= end_date:
                    membership.extras = {'acting': 'true'}

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Scrape NYC Council members (including Public Advocates) and
    committees, merging Legistar API data with the Legistar web UI.

    Fix: the contact-detail guard called the dict — ``web(contact_type)``
    — which raises ``TypeError`` whenever the first guard is truthy; it
    now subscripts ``web[contact_type]`` as intended.
    """
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage

    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    # Map person name -> their Legistar web UI row.
    web_info = {}
    for member, _ in web_scraper.councilMembers():
        name = member['Person Name']['label'].strip()
        web_info[name] = member

    # Exactly one body is expected to match (tuple-unpack asserts this).
    city_council, = [
        body for body in self.bodies() if body['BodyName'] == 'City Council'
    ]

    terms = collections.defaultdict(list)

    public_advocates = {
        # Match casing to Bill De Blasio as council member
        'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
        'The Public Advocate (Ms. James)': 'Letitia James',
    }

    for office in self.body_offices(city_council):
        name = office['OfficeRecordFullName']
        name = public_advocates.get(name, name).strip()
        terms[name].append(office)

        # Add past members (and public advocates) missing from the web UI.
        if name not in web_info:
            web_info[name] = collections.defaultdict(lambda: None)

    # Check that we have everyone we expect, formatted consistently, in
    # both information arrays. For instance, this will fail if we forget to
    # strip trailing spaces from names on one side or the other (which has
    # the effect of omitting information, such as post, from the scrape).
    assert set(web_info.keys()) == set(terms.keys())

    members = {}

    for member, offices in terms.items():
        p = Person(member)
        web = web_info[member]

        for term in offices:
            role = term['OfficeRecordTitle']
            if role == 'Public Advocate':
                role = 'Non-Voting Council Member'
            else:
                role = 'Council Member'

            # Normalize "District 01" style values to "District 1".
            district = web.get('District', '').replace(' 0', ' ')

            p.add_term(role,
                       'legislature',
                       district=district,
                       start_date=self.toDate(
                           term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

            party = web.get('Political Party')
            if party == 'Democrat':
                party = 'Democratic'
            if party:
                p.add_party(party)

            if web.get('Photo'):
                p.image = web['Photo']

            contact_types = {
                "City Hall Office": ("address", "City Hall Office"),
                "City Hall Phone": ("voice", "City Hall Phone"),
                "Ward Office Phone": ("voice", "Ward Office Phone"),
                "Ward Office Address": ("address", "Ward Office Address"),
                "Fax": ("fax", "Fax")
            }
            for contact_type, (type_, _note) in contact_types.items():
                # BUG FIX: was `web(contact_type)` — calling the dict.
                if web.get(contact_type) and web[contact_type] != 'N/A':
                    p.add_contact_detail(type=type_,
                                         value=web[contact_type],
                                         note=_note)

            if web.get('E-mail'):
                p.add_contact_detail(type="email",
                                     value=web['E-mail']['url'],
                                     note='E-mail')

            if web.get('Web site'):
                p.add_link(web['Web site']['url'], note='web site')

            if web.get('Notes'):
                p.extras = {'Notes': web['Notes']}

            if not p.sources:  # Only add sources once
                source_urls = self.person_sources_from_office(term)
                person_api_url, person_web_url = source_urls
                p.add_source(person_api_url, note='api')
                p.add_source(person_web_url, note='web')

        members[member] = p

    committee_types = [
        'Committee', 'Inactive Committee', 'Select Committee',
        'Subcommittee', 'Task Force', 'Land Use'
    ]  # Committee on Land Use

    body_types = {
        k: v
        for k, v in self.body_types().items() if k in committee_types
    }

    for body in self.bodies():
        if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):

            # Skip typo in API data (the 'amd' is intentional to match).
            if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                continue

            parent_org = PARENT_ORGS.get(body['BodyName'],
                                         'New York City Council')
            body_name = body['BodyName']

            o = Organization(body_name,
                             classification='committee',
                             parent_id={'name': parent_org})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.
                format(**body),
                note='web')

            for office in self.body_offices(body):
                # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                # 'Committee Member', None, 'CHAIRPERSON'
                role = office['OfficeRecordTitle']
                if role and role.lower() == 'chairperson':
                    role = 'Chairperson'
                else:
                    role = 'Member'

                person = office['OfficeRecordFullName']
                person = public_advocates.get(person, person).strip()

                if person in members:
                    p = members[person]
                else:
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                p.add_membership(o,
                                 role=role,
                                 start_date=self.toDate(
                                     office['OfficeRecordStartDate']),
                                 end_date=self.toDate(
                                     office['OfficeRecordEndDate']))

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Scrape Chicago City Council members and committees.

    Term data comes from the Legistar web API; extra per-member fields
    (ward, photo, contact details) come from a secondary scrape of the
    human-facing Legistar pages.  Yields one Organization per committee
    and joint committee, then one Person per council member.
    """
    body_types = self.body_types()
    # Exactly one body named 'City Council' is expected; the one-element
    # unpacking raises if zero or several match.
    city_council, = [
        body for body in self.bodies()
        if body['BodyName'] == 'City Council'
    ]
    # Group each member's (non-vacant) office records by member name.
    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        if 'VACAN' not in office['OfficeRecordFullName']:
            terms[office['OfficeRecordFullName'].strip()].append(office)

    # Secondary scraper for the web UI, sharing this scraper's rate limit
    # and cache configuration.
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'https://chicago.legistar.com/DepartmentDetail.aspx?ID=12357&GUID=4B24D5A9-FED0-4015-9154-6BFFFB2A8CB4&R=8bcbe788-98cd-4040-9086-b34fa8e49881'
    web_scraper.ALL_MEMBERS = '3:3'
    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage
    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    # Index web-page rows by member display name.
    web_info = {}
    for member, _ in web_scraper.councilMembers(
            {'ctl00$ContentPlaceHolder$lstName': 'City Council'}):
        web_info[member['Person Name']['label']] = member

    # Two former aldermen are missing from the web listing; defaultdict
    # makes every field other than their ward read as None.
    web_info['Balcer, James'] = collections.defaultdict(lambda: None)
    web_info['Fioretti, Bob'] = collections.defaultdict(lambda: None)
    web_info['Balcer, James']['Ward/Office'] = 11
    web_info['Fioretti, Bob']['Ward/Office'] = 2

    members = {}
    for member, offices in terms.items():
        # NOTE(review): raises KeyError if an API member is absent from
        # the web listing and not patched in above — confirm intended.
        web = web_info[member]
        p = Person(member)
        for term in offices:
            role = term['OfficeRecordTitle']  # NOTE: read but unused below
            p.add_term('Alderman',
                       'legislature',
                       district="Ward {}".format(int(web['Ward/Office'])),
                       start_date=self.toDate(
                           term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))
        if web.get('Photo'):
            p.image = web['Photo']
        # Map web-page field -> (pupa contact type, note).
        contact_types = {
            "City Hall Address": ("address", "City Hall Address"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }
        for contact_type, (type_, _note) in contact_types.items():
            if web[contact_type] and web[contact_type] != 'N/A':
                p.add_contact_detail(type=type_,
                                     value=web[contact_type],
                                     note=_note)
        if web["E-mail"] and web["E-mail"]["label"] and web["E-mail"]["label"] != 'N/A':
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['label'],
                                 note='E-mail')
        if web['Website']:
            p.add_link(web['Website']['url'])
        # NOTE(review): `term` here is the last office record from the
        # loop above — sources are derived from that final term only.
        source_urls = self.person_sources_from_office(term)
        person_api_url, person_web_url = source_urls
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')
        members[member] = p

    # Standing committees.
    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Committee']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})
            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.
                format(**body),
                note='web')
            for office in self.body_offices(body):
                # messed up record for joanna thompson
                if office['OfficeRecordId'] == 1055:
                    continue
                role = office['OfficeRecordTitle']
                if role not in ("Vice Chair", "Chairman"):
                    role = 'Member'
                person = office['OfficeRecordFullName'].strip()
                if person in members:
                    p = members[person]
                else:
                    # Committee member never seen on the council roster:
                    # create them here with API/web sources.
                    p = Person(person)
                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')
                    members[person] = p
                p.add_membership(body['BodyName'],
                                 role=role,
                                 start_date=self.toDate(
                                     office['OfficeRecordStartDate']),
                                 end_date=self.toDate(
                                     office['OfficeRecordEndDate']))
            yield o

    # Joint committees are yielded with sources but no memberships.
    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Joint Committee']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})
            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.
                format(**body),
                note='web')
            yield o

    for p in members.values():
        yield p
def scrape_committee(self, committee_id): old = self.api('committees/' + committee_id + '?') id = old.pop('id') old.pop('created_at') old.pop('updated_at') old.pop('country', None) old.pop('level', None) old.pop('state') old.pop('votesmart_id', None) old.pop('+short_name', None) old.pop('+session', None) old.pop('+az_committee_id', None) com = old.pop('committee') sub = old.pop('subcommittee') parent_id = old.pop('parent_id') chamber = old.pop('chamber') if chamber == 'joint': chamber = '' if self.state in ('ne', 'dc'): chamber = 'legislature' if sub: if parent_id: parent = self._committees[parent_id]._id new = Organization(sub, parent_id=parent, classification='committee') else: new = Organization(com + ': ' + sub, chamber=chamber, classification='committee') else: new = Organization(com, chamber=chamber, classification='committee') assert parent_id is None # all_ids for id in old.pop('all_ids'): new.add_identifier(id, scheme='openstates') self._committees[id] = new # sources for source in old.pop('sources'): new.add_source(**source) # members start, end = self.get_term_years() for role in old.pop('members'): # leg_id, com_id, role, start, end if role['leg_id']: self._roles.add((role['leg_id'], id, role['role'], start, end)) to_extras = [ '+twitter', '+description', '+code', '+secretary', '+office_hours', '+office_phone', '+meetings_info', '+status', '+aide', '+contact_info', '+comm_type', 'comm_type', 'aide', 'contact_info', '+town_represented', '+action_code', ] for k in to_extras: v = old.pop(k, None) if v: new.extras[k.replace('+', '')] = v assert not old, old.keys() return new
def scrape_chamber(self, chamber): committee_list_urls = { "lower": "https://capitol.texas.gov/Committees/" "CommitteesMbrs.aspx?Chamber=H", "upper": "https://capitol.texas.gov/Committees/" "CommitteesMbrs.aspx?Chamber=S", } committee_list_url = committee_list_urls[chamber] committee_list_page = self.lxmlize(committee_list_url) committee_nodes = self.get_nodes( committee_list_page, '//form[@id="ctl00"]//a[@id="CmteList"]') for committee_node in committee_nodes: committee_name = committee_node.text.strip() committee = Organization(name=committee_name, chamber=chamber, classification="committee") # Get the committee profile page. committee_page_url = committee_node.get("href") committee_page = self.lxmlize(committee_page_url) # Capture table with committee membership data. details_table = self.get_node(committee_page, '//div[@id="content"]//table[2]') if details_table is not None: # Skip the first row because it currently contains only headers detail_rows = self.get_nodes(details_table, "./tr")[1:] for detail_row in detail_rows: label_text = self.get_node(detail_row, "./td[1]//text()") if label_text: label_text = label_text.strip().rstrip(":") if label_text in ("Chair", "Vice Chair"): member_role = "chair" else: member_role = "member" member_name_text = self.get_node(detail_row, "./td[2]/a/text()") # Clean titles from member names. if chamber == "upper": member_name = re.sub(r"^Sen\.[\s]*", "", member_name_text) elif chamber == "lower": member_name = re.sub(r"^Rep\.[\s]*", "", member_name_text) # Collapse multiple whitespaces in member names. member_name = re.sub(r"[\s]{2,}", " ", member_name).strip() committee.add_member(member_name, member_role) committee.add_source(committee_list_url) committee.add_source(committee_page_url) yield committee
def scrape(self):
    """Scrape NYC council members and their committee memberships.

    Groups Legistar rows by member detail-page URL, merges contiguous
    terms in the same district into single spans, and yields persons,
    then committees, then two hand-entered committee records missing
    from Legistar.

    Fix: removed a leftover debug ``print(start_date, end_date)``.
    """
    noncommittees = {'Committee of the Whole'}
    committee_d = {}
    people_d = {}
    # Group each member's rows by their detail-page URL; the committee
    # listing from the first-seen row is kept for the whole member.
    for councilman, committees in self.councilMembers():
        if 'url' in councilman['Person Name']:
            councilman_url = councilman['Person Name']['url']
            if councilman_url in people_d:
                people_d[councilman_url][0].append(councilman)
            else:
                people_d[councilman_url] = [councilman], committees

    for person_entries, committees in people_d.values():
        # Most recent row wins for profile fields.
        councilman = person_entries[-1]
        p = Person(councilman['Person Name']['label'])
        # Special-case: preferred display name for Letitia James.
        if p.name == 'Letitia James':
            p.name = 'Letitia Ms. James'
            p.add_name('Letitia James')

        # One (start, end, district) span per Legistar row.
        spans = [(self.toTime(entry['Start Date']).date(),
                  self.toTime(entry['End Date']).date(),
                  entry['District'])
                 for entry in person_entries]

        # Merge spans that are contiguous (end + 1 day == next start)
        # and in the same district.
        merged_spans = []
        last_end_date = None
        last_district = None
        for start_date, end_date, district in sorted(spans):
            if last_end_date is None:
                span = [start_date, end_date, district]
            elif (start_date - last_end_date) == datetime.timedelta(1) \
                    and district == last_district:
                span[1] = end_date
            else:
                merged_spans.append(span)
                span = [start_date, end_date, district]
            last_end_date = end_date
            last_district = district
        merged_spans.append(span)

        for start_date, end_date, district in merged_spans:
            # District label always comes from the latest row, normalized
            # to drop zero-padding (e.g. 'District 05' -> 'District 5').
            district = councilman['District'].replace(' 0', ' ')
            # End of the current term sentinel means "still serving".
            if end_date == datetime.date(2017, 12, 31):
                end_date = ''
            else:
                end_date = end_date.isoformat()
            p.add_term('Council Member', 'legislature',
                       district=district,
                       start_date=start_date.isoformat(),
                       end_date=end_date)

        party = councilman['Political Party']
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)

        if councilman['Photo']:
            p.image = councilman['Photo']

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['url'],
                                 note='E-mail')

        if councilman['Web site']:
            p.add_link(councilman['Web site']['url'], note='web site')

        p.extras = {'Notes': councilman['Notes']}
        p.add_source(councilman['Person Name']['url'], note='web')

        for committee, _, _ in committees:
            committee_name = committee['Department Name']['label']
            # Only true committees; skip administrative departments.
            if committee_name not in noncommittees \
                    and 'committee' in committee_name.lower():
                o = committee_d.get(committee_name, None)
                if o is None:
                    parent_id = PARENT_ORGS.get(committee_name,
                                                'New York City Council')
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name': parent_id})
                    o.add_source(committee['Department Name']['url'])
                    committee_d[committee_name] = o
                membership = o.add_member(p, role=committee["Title"])
                membership.start_date = self.mdY2Ymd(committee["Start Date"])
        yield p

    # Yield parent committees before subcommittees so parent references
    # resolve on import.
    for o in committee_d.values():
        if 'Committee' in o.name:
            yield o
    for o in committee_d.values():
        if 'Subcommittee' in o.name:
            yield o

    # Two bodies that are absent from Legistar, entered by hand.
    o = Organization('Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
                     classification='committee',
                     parent_id={'name': 'New York City Council'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o

    o = Organization('Subcommittee on Drug Abuse',
                     classification='committee',
                     parent_id={'name': 'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o
def scrape(self): body_types = self.body_types() board_of_directors, = [ body for body in self.bodies() if body['BodyName'] == 'Board of Directors' ] members = {} for office in self.body_offices(board_of_directors): members.setdefault(office['OfficeRecordFullName'], []).append(office) for member, offices in members.items(): p = Person(member) for term in offices: role = term['OfficeRecordTitle'] if role != 'non-voting member': role = 'Board Member' post = VOTING_POSTS.get(member) else: role = 'Nonvoting Board Member' post = NONVOTING_POSTS.get(member) p.add_term(role, 'legislature', district=post, start_date=self.toDate( office['OfficeRecordStartDate']), end_date=self.toDate(office['OfficeRecordEndDate'])) legistar_api = self.BASE_URL + '/OfficeRecords/' p.add_source(legistar_api, note='api') print(p) yield p adjunct_members = {} for body in self.bodies(): if body['BodyTypeId'] == body_types['Committee']: o = Organization(body['BodyName'], classification='committee', parent_id={'name': 'Board of Directors'}) o.add_source(self.BASE_URL + '/Bodies/') for office in self.body_offices(body): role = office['OfficeRecordTitle'] if role not in ("Chair", "Vice Chair"): role = 'Member' person = office['OfficeRecordFullName'] if person not in members: if person not in adjunct_members: p = Person(person) p.add_source('foo') else: p = adjunct_members[person] p.add_membership(body['BodyName'], role=role, start_date=self.toDate( office['OfficeRecordStartDate']), end_date=self.toDate( office['OfficeRecordEndDate'])) adjunct_members[person] = p else: o.add_member(office['OfficeRecordFullName'], role, start_date=self.toDate( office['OfficeRecordStartDate']), end_date=self.toDate( office['OfficeRecordEndDate'])) yield o for p in adjunct_members.values(): yield p
def scrape_session(self, session, chambers):
    """Scrape Georgia committees for one session via the SOAP service.

    Retries flaky SOAP calls through ``backoff``; yields each
    subcommittee before its parent committee.
    """
    sid = SESSION_SITE_IDS[session]
    committees = backoff(self.cservice.GetCommitteesBySession, sid)

    # if committees.strip() == "":
    #     return  # If we get here, it's a problem.
    # Commenting this out for future debugging. - PRT

    # An empty payload means the session id is wrong or the service is
    # broken — fail loudly rather than silently scrape nothing.
    if str(committees).strip() == "":
        raise ValueError("Error: No committee data for sid: %s" % (sid))

    committees = committees['CommitteeListing']
    for committee in committees:
        cid = committee['Id']
        # Re-fetch the full committee record (the listing is shallow).
        committee = backoff(self.cservice.GetCommittee, cid)
        subctty_cache = {}

        comname, typ, guid, code, description = [
            committee[x]
            for x in ['Name', 'Type', 'Id', 'Code', 'Description']
        ]
        comchamber = {
            "House": "lower",
            "Senate": "upper",
            "Joint": "joint"
        }[typ]

        # Dedupe committees across calls by (type, code).
        ctty_key = '{}-{}'.format(typ, code)
        if ctty_key not in self.ctty_cache:
            ctty = Organization(chamber=comchamber,
                                name=comname,
                                classification='committee')
            ctty.extras = {
                'code': code,
                'guid': guid,
                'description': description,
            }
            self.ctty_cache[ctty_key] = ctty
        # NOTE(review): on a cache hit `ctty` is not rebound here, so the
        # code below would use the previous iteration's committee (or
        # raise NameError on a first-iteration hit) — confirm whether a
        # `else: ctty = self.ctty_cache[ctty_key]` was intended.

        members = committee['Members']['CommitteeMember']
        for member in members:
            name = "{First} {Last}".format(
                **dict(member['Member']['Name']))
            role = member['Role']
            membership = ctty.add_member(name, role)
            membership.extras = {'guid': member['Member']['Id']}

            # A member may also sit on subcommittees of this committee.
            subcoms = member['SubCommittees'] or []
            for subcom in subcoms:
                subcom = subcom[1][0]
                subguid = subcom['Id']
                subcommittee = subcom['Name']
                if subcommittee in subctty_cache:
                    # Add member to existing subcommittee.
                    subctty = subctty_cache[subcommittee]
                else:
                    # Create subcommittee.
                    subctty = Organization(name=subcommittee,
                                           classification='committee',
                                           parent_id={
                                               'classification': comchamber,
                                               'name': comname
                                           })
                    subctty.extras = {
                        'guid': subguid,
                    }
                    subctty.add_source(self.csource)
                    subctty.add_source(
                        CTTIE_URL.format(**{
                            "sid": sid,
                            "cttie": guid,
                        }))
                    subctty_cache[subcommittee] = subctty
                membership = subctty.add_member(name, role)
                membership.extras = {'guid': member['Member']['Id']}

        # Subcommittees first, then the parent committee.
        for subctty in subctty_cache.values():
            yield subctty

        ctty.add_source(self.csource)
        ctty.add_source(CTTIE_URL.format(**{
            "sid": sid,
            "cttie": guid,
        }))
        yield ctty
def scrape(self, session=None):
    """Scrape Vermont legislative committees from the private list API.

    Fix: the comment promised to "exclude committee assistants", but
    non-legislator entries were logged and then still added as members;
    a ``continue`` now actually skips them.

    :param session: session name; defaults to the latest session.
    """
    if session is None:
        session = self.latest_session()
    # The URL slug is the session name with its first five characters
    # dropped.
    year_slug = session[5:]

    # Load all committees via the private API
    committee_dump_url = \
        'http://legislature.vermont.gov/committee/loadList/{}/'.\
        format(year_slug)
    json_data = self.get(committee_dump_url).text
    committees = json.loads(json_data)['data']

    # Parse the information from each committee
    for info in committees:
        # Strip whitespace from strings
        info = {k: v.strip() for k, v in info.items()}

        # Determine the chamber
        if info['CommitteeType'] == 'House Standing':
            chamber = 'lower'
        elif info['CommitteeType'] == 'Senate Standing':
            chamber = 'upper'
        elif info['CommitteeType'] == 'Joint Committee':
            chamber = 'joint'
        elif info['CommitteeType'] in ('Study Committee', 'Commissions'):
            # Ad-hoc bodies: infer the chamber from the name prefix.
            if info['CommitteeName'].startswith("House"):
                chamber = 'lower'
            elif info['CommitteeName'].startswith("Senate"):
                chamber = 'upper'
            else:
                chamber = 'joint'
        else:
            raise AssertionError(
                "Unknown committee type found: '{}'".format(
                    info['CommitteeType']))

        comm = Organization(name=info['CommitteeName'],
                            chamber=chamber,
                            classification='committee')

        # The member list arrives as HTML separated by </br>; strip all
        # tags and discard empty entries.
        REMOVE_TAGS_RE = r'<.*?>'
        members = [
            re.sub(REMOVE_TAGS_RE, '', x)
            for x in info['Members'].split('</br>')
        ]
        members = [x.strip() for x in members if x.strip()]
        for member in members:
            # Strip out titles, and exclude committee assistants
            if member.startswith("Rep. "):
                member = member[len("Rep. "):]
            elif member.startswith("Sen. "):
                member = member[len("Sen. "):]
            else:
                self.info("Non-legislator member found: {}".format(member))
                # FIX: actually skip non-legislators.
                continue

            # Determine the member's role in the committee
            if ',' in member:
                (member, role) = [x.strip() for x in member.split(',')]
                # Guard against "Smith, Jr." being read as a role.
                if 'jr' in role.lower() or 'sr' in role.lower():
                    raise AssertionError(
                        "Name suffix confused for a committee role")
            else:
                role = 'member'

            comm.add_member(name_or_person=member, role=role)

        comm.add_source(committee_dump_url)
        yield comm
def scrape_committee(self, chamber, name, url, subcommittee=None): name = self._fix_committee_name(name) name = self._fix_committee_case(name) page = self.get(url).text page = lxml.html.fromstring(page) page.make_links_absolute(url) # Get the subcommittee name. xpath = '//div[@class="ms-WPBody"]//table//tr/td/b/text()' if subcommittee: subcommittee = page.xpath(xpath) if subcommittee: subcommittee = page.xpath(xpath).pop(0) subcommittee = self._fix_committee_name( subcommittee, parent=name, subcommittee=True) subcommittee = self._fix_committee_case(subcommittee) else: subcommittee = None # Dedupe. if (chamber, name, subcommittee) in self._seen: return self._seen.add((chamber, name, subcommittee)) comm = Organization(chamber=chamber, name=name, classification='committee') comm.add_source(url) member_nodes = page.xpath('//table[@class="dxgvTable"]/tr') for member_node in member_nodes: # Skip empty rows. if member_node.attrib['class'] == 'dxgvEmptyDataRow': continue mtype = member_node.xpath('string(td[1])').strip() if not mtype: mtype = 'member' member = member_node.xpath('string(td[3])').split() member = ' '.join(member[1:]) comm.add_member(member, role=mtype) for a in page.xpath('//table[@id="ctl00_m_g_a194465c_f092_46df_b753_' '354150ac7dbd_ctl00_tblContainer"]//ul/li/a'): sub_name = a.text.strip() sub_url = a.get('href').replace('../', '/') self.scrape_committee(chamber, name, sub_url, subcommittee=sub_name) if not comm._related: if subcommittee: self.warning('Not saving empty subcommittee {}.'.format( subcommittee)) else: self.warning('Not saving empty committee {}.'.format(name)) else: yield comm
def scrape_committees_pdf(self, year, chamber, filename, url): if chamber == "lower" and year == "2015": text = self._fix_house_text(filename).decode() else: text = convert_pdf(filename, type="text-nolayout").decode() for hotgarbage, replacement in ( ( r"Judicial Branch, Law Enforcement,\s+and\s+Justice", "Judicial Branch, Law Enforcement, and Justice", ), ( r"Natural Resources and\s+Transportation", "Natural Resources and Transportation", ), ( r"(?u)Federal Relations, Energy,?\s+and\s+Telecommunications", "Federal Relations, Energy, and Telecommunications", ), ): text = re.sub(hotgarbage, replacement, text) lines = iter(text.splitlines()) # Drop any lines before the ag committee. lines = dropwhile(lambda s: "Agriculture" not in s, lines) comm = None for line in lines: # Replace Unicode variants with ASCII equivalents line = line.replace(" ", " ").replace("‐", "-") if "Subcommittees" in line: self.warning("Currently, we're skipping subcommittees") # https://github.com/openstates/openstates/issues/2099 break if is_committee_name(line): if comm and comm._related: yield comm committee = line.strip() comm = Organization( name=committee, chamber=chamber, classification="committee" ) comm.add_source(url) elif is_legislator_name(line): name, party = line.rsplit("(", 1) name = name.strip().replace("Rep. ", "").replace("Sen. ", "") if re.search(" Ch", party): role = "chair" elif " VCh" in party: role = "vice chair" elif " MVCh" in party: role = "minority vice chair" else: role = "member" comm.add_member(name, role) if comm._related: yield comm
def scrape(self):
    """Scrape Pittsburgh City Council members and committees.

    Fixes: in the committee loop, a Person created for someone not on
    the council roster was never stored in ``members``, so their
    memberships were silently dropped — new persons are now registered
    (with an API source) and yielded, matching the sibling Legistar
    scrapers.  Also removed the dead ``or role == "Councilmember"``
    clause (that value is already outside the Chair/Vice Chair tuple).
    """
    body_types = self.body_types()

    # Exactly one 'City Council' body is expected.
    city_council, = [body for body in self.bodies()
                     if body["BodyName"] == "City Council"]

    # Group non-vacant office records by member name.
    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        if "VACAN" not in office["OfficeRecordFullName"]:
            terms[office["OfficeRecordFullName"].strip()].append(office)

    # Secondary scrape of the web UI for e-mail addresses.
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = "https://pittsburgh.legistar.com/People.aspx"
    web_scraper.COMMITTEELIST = "https://pittsburgh.legistar.com/Departments.aspx"

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage
    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    web_info = {}
    for member in web_scraper.councilMembers():
        web_info[member["Person Name"]] = member

    members = {}
    for member, offices in terms.items():
        person = Person(member)
        for term in offices:
            person.add_term("Councilmember", "legislature",
                            start_date=self.toDate(term["OfficeRecordStartDate"]),
                            end_date=self.toDate(term["OfficeRecordEndDate"]))

            if member in web_info:
                web = web_info[member]
                if web["E-mail"] and web["E-mail"]["label"] \
                        and web["E-mail"]["label"] != "N/A":
                    person.add_contact_detail(type="email",
                                              value=web["E-mail"]["label"],
                                              note="E-mail")

            # The API response also carries contact details.
            person_api_url, person_api_response = \
                self.person_sources_from_office(term)
            person.add_source(person_api_url, note="api")

            if person_api_response["PersonAddress1"]:
                address = (person_api_response["PersonAddress1"] + ", "
                           + person_api_response["PersonCity1"] + ", "
                           + person_api_response["PersonState1"] + " "
                           + person_api_response["PersonZip1"])
                person.add_contact_detail(type="address",
                                          value=address,
                                          note="Office address")
            if person_api_response["PersonPhone"]:
                person.add_contact_detail(type="voice",
                                          value=person_api_response["PersonPhone"],
                                          note="Office phone")
            if person_api_response["PersonWWW"]:
                person.add_contact_detail(type="url",
                                          value=person_api_response["PersonWWW"],
                                          note="District website")

        members[member] = person

    for body in self.bodies():
        if body["BodyTypeId"] == body_types["Committee"]:
            body_name_clean = body["BodyName"].strip()
            organization = Organization(
                body_name_clean,
                classification="committee",
                parent_id={"name": "Pittsburgh City Council"})
            organization.add_source(
                self.BASE_URL + "/bodies/{BodyId}".format(**body),
                note="api")

            for office in self.body_offices(body):
                role = office["OfficeRecordMemberType"]
                if role not in ("Vice Chair", "Chair"):
                    role = "Member"
                name = office["OfficeRecordFullName"].strip()
                if name in members:
                    person = members[name]
                else:
                    # FIX: register committee-only people so their
                    # memberships are actually emitted.
                    person = Person(name)
                    person_api_url, _ = self.person_sources_from_office(office)
                    person.add_source(person_api_url, note="api")
                    members[name] = person
                person.add_membership(
                    body_name_clean,
                    role=role,
                    start_date=self.toDate(office["OfficeRecordStartDate"]),
                    end_date=self.toDate(office["OfficeRecordEndDate"]))

            yield organization

    for person in members.values():
        yield person
def scrape(self):
    """Scrape Newfoundland & Labrador mayors from the Municipal Directory PDF.

    Downloads the directory PDF, extracts layout-preserving text with
    ``pdftotext``, and slices each row into fixed-width columns located
    from the header line.  Yields one Organization and one Person
    (mayor) per municipality.

    Fixes: the downloaded PDF is now written in binary mode inside a
    context manager (the handle was never closed and text mode can
    corrupt binary data); the cleanup shell-out ``os.system('rm ...')``
    is replaced by ``os.remove``.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    url = page.xpath(
        '//a[contains(text(),"Municipal Directory")]/@href')[0]
    response = urlopen(url).read()
    with open('/tmp/nl.pdf', 'wb') as pdf:
        pdf.write(response)

    data = subprocess.check_output(
        ['pdftotext', '-layout', '/tmp/nl.pdf', '-'])
    pages = data.split('Municipal Directory')[1:]
    for page in pages:
        page = page.splitlines(True)
        column_index = {}
        # Locate the header row once per page and record the character
        # offsets of each column.
        for line in page:
            if 'Official Name' in line:
                column_index['dist_end'] = re.search('Region', line).start()
                column_index['name_start'] = re.search('Mayor', line).start() + 1
                column_index['name_end'] = re.search('Clerk', line).start() - 1
                column_index['phone_start'] = re.search('Line 1', line).start()
                column_index['phone_end'] = re.search('Line 2', line).start() - 1
                column_index['fax_start'] = re.search('Fax', line).start()
                column_index['fax_end'] = re.search('E-mail', line).start() - 2
                column_index['email_start'] = column_index['fax_end'] + 1
                column_index['email_end'] = re.search('Address', line).start() - 1
                column_index['address_start'] = column_index['email_end'] + 1
                column_index['address_end'] = re.search('Days', line).start() - 1
                break

        for line in page:
            # Skip the header itself and blank lines.
            if 'Official Name' in line or not line.strip():
                continue
            district = line[:column_index['dist_end']]
            name = line[column_index['name_start']:
                        column_index['name_end']].strip()
            phone = line[column_index['phone_start']:
                         column_index['phone_end']].strip().replace(
                             '(', '').replace(') ', '-')
            # fax = line[column_index['fax_start']:column_index['fax_end']].strip().replace('(', '').replace(') ', '-')
            email = line[column_index['email_start']:
                         column_index['email_end']].strip()
            address = line[column_index['address_start']:
                           column_index['address_end']].strip()
            address = re.sub(r'\s{2,}', ', ', address)

            if not name or not district:
                continue

            org = Organization(
                name=district + ' Municipal Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)
            org.add_source(url)
            yield org

            p = Person(primary_org='legislature', name=name,
                       district=district)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            membership = p.add_membership(org, role='Mayor',
                                          district=district)
            if phone:
                membership.add_contact_detail('voice', phone, 'legislature')
            # I'm excluding fax because that column isn't properly aligned
            # if fax:
            #     membership.add_contact_detail('fax', fax)
            if email:
                membership.add_contact_detail('email', email)
            if address:
                membership.add_contact_detail('address', address,
                                              'legislature')
            yield p

    os.remove('/tmp/nl.pdf')
def scrape(self, chamber=None):
    """Scrape New Mexico legislative committees.

    Fix: the committee-specific chamber override previously rebound the
    ``chamber`` loop variable (``chamber = m_chambers.pop()``), which
    leaked into every later committee in the same chamber pass — later
    interim committees then got the wrong chamber and lost their
    'interim' role prefix.  A per-committee ``org_chamber`` is used
    instead.

    :param chamber: optional single chamber to scrape; all three
        ('upper', 'lower', 'legislature') by default.
    """
    if chamber:
        chambers = [chamber]
    else:
        chambers = ['upper', 'lower', 'legislature']

    # Xpath query string format for legislative chamber committee urls
    base_xpath = (
        '//table[@id="MainContent_gridView{0}Committees"]//a'
        '[contains(@id, "MainContent_gridView{1}Committees_link'
        '{2}Committee")]/@href')

    chamber_paths = {
        'upper': {
            'url': '{}Senate_Standing'.format(base_url),
            'chamber_xpath': base_xpath.format('Senate', 'Senate', 'Senate')
        },
        'lower': {
            'url': '{}House_Standing'.format(base_url),
            'chamber_xpath': base_xpath.format('House', 'House', 'House')
        },
        'legislature': {
            'url': '{}Interim'.format(base_url),
            'chamber_xpath': base_xpath.format('', '', '')
        }
    }

    for chamber in chambers:
        page = self.lxmlize(chamber_paths[chamber]['url'])
        committee_urls = self.get_nodes(
            page, chamber_paths[chamber]['chamber_xpath'])

        for committee_url in committee_urls:
            committee_page = self.lxmlize(committee_url)

            # Committee name comes from the breadcrumb trail.
            c_name = committee_page.xpath(
                '//li/a[contains(@id, "siteMapBreadcrumbs_lnkPage_")]')[
                -1].text_content().strip()
            if not c_name:
                self.warning('No legislative committee found at '
                             '{}'.format(committee_url))
                continue

            members_xpath = ('//table[@id="MainContent_formView'
                             'CommitteeInformation_grid'
                             'ViewCommitteeMembers"]/tbody/tr')
            member_nodes = self.get_nodes(committee_page, members_xpath)

            # Column positions in the membership table.
            tds = {
                'title': 0,
                'name': 1,
                'role': 3
            }

            members = []
            for member_node in member_nodes:
                m_title = member_node[tds['title']].text_content()
                m_name = self.get_node(
                    member_node[tds['name']],
                    './/a[contains(@href, '
                    '"/Members/Legislator?SponCode=")]').text_content()
                role = member_node[tds['role']].text_content()

                if m_title == 'Senator':
                    m_chamber = 'upper'
                elif m_title == 'Representative':
                    m_chamber = 'lower'
                else:
                    m_chamber = None

                if role in ('Chair', 'Co-Chair', 'Vice Chair', 'Member',
                            'Advisory', 'Ranking Member'):
                    if chamber == 'legislature':
                        m_role = 'interim {}'.format(role.lower())
                    else:
                        m_role = role.lower()
                else:
                    m_role = None

                if m_role:
                    members.append(Member(name=m_name, role=m_role,
                                          chamber=m_chamber))

            # Interim committees are collected during the scraping
            # for joint committees, and most interim committees
            # have members from both chambers. However, a small
            # number of interim committees (right now, just 1) have
            # only members from one chamber, so the chamber is set
            # to their chamber instead of 'legislature' for those
            # committees.
            # FIX: use a per-committee variable instead of rebinding
            # the `chamber` loop variable.
            org_chamber = chamber
            if chamber == 'legislature':
                m_chambers = set([mem.chamber for mem in members])
                if len(m_chambers) == 1:
                    org_chamber = m_chambers.pop()

            committee = Organization(name=clean_committee_name(c_name),
                                     chamber=org_chamber,
                                     classification='committee')
            for member in members:
                committee.add_member(member.name, member.role)
            committee.add_source(committee_url)

            if not committee._related:
                self.warning('skipping blank committee {0} '
                             'at {1}'.format(c_name, committee_url))
            else:
                yield committee
def scrape_chamber(self, chamber):
    """Scrape South Carolina legislators (and their committees) for one chamber.

    Walks the chamber member list, visits each member's profile page for
    party/district/photo/contact details, and yields each committee the
    first time it is seen, then the member.
    """
    if chamber == 'lower':
        url = 'http://www.scstatehouse.gov/member.php?chamber=H'
    else:
        url = 'http://www.scstatehouse.gov/member.php?chamber=S'

    # Committees already yielded, keyed by name.
    seen_committees = {}

    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    for a in doc.xpath('//a[@class="membername"]'):
        full_name = a.text
        leg_url = a.get('href')

        # Strip the courtesy title from the link text.
        if full_name.startswith('Senator'):
            full_name = full_name.replace('Senator ', '')
        if full_name.startswith('Representative'):
            full_name = full_name.replace('Representative ', '')

        leg_html = self.get(leg_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)

        # Members who resigned mid-term are skipped entirely.
        if 'Resigned effective' in leg_html:
            self.info('Resigned')
            continue

        # The profile's styled paragraph holds party / district / map text.
        party, district, _ = leg_doc.xpath('//p[@style="font-size: 17px;'
                                           ' margin: 0 0 0 0; padding: 0;"]/text()')

        if 'Republican' in party:
            party = 'Republican'
        elif 'Democrat' in party:
            party = 'Democratic'

        # District # - County - Map
        district = district.split()[1]
        try:
            photo_url = leg_doc.xpath('//img[contains(@src,"/members/")]/@src')[0]
        except IndexError:
            self.warning("No Photo URL for {}".format(full_name))
            photo_url = ''
        person = Person(name=full_name,
                        district=district,
                        party=party,
                        primary_org=chamber,
                        image=photo_url)

        # office address / phone — located purely by inline style, so an
        # IndexError just means the block is absent on this profile.
        try:
            addr_div = leg_doc.xpath('//div[@style="float: left; width: 225px;'
                                     ' margin: 10px 5px 0 20px; padding: 0;"]')[0]
            capitol_address = addr_div.xpath('p[@style="font-size: 13px;'
                                             ' margin: 0 0 10px 0; padding: 0;"]'
                                             )[0].text_content()

            phone = addr_div.xpath('p[@style="font-size: 13px;'
                                   ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
            capitol_phone = phone.strip()

            if capitol_address:
                person.add_contact_detail(type='address',
                                          value=capitol_address,
                                          note='Capitol Office')

            if capitol_phone:
                person.add_contact_detail(type='voice',
                                          value=capitol_phone,
                                          note='Capitol Office')
        except IndexError:
            self.warning('no capitol address for {0}'.format(full_name))

        # home address / phone — same style-based location strategy.
        try:
            addr_div = leg_doc.xpath('//div[@style="float: left;'
                                     ' width: 225px; margin: 10px 0 0 20px;"]')[0]
            addr = addr_div.xpath('p[@style="font-size: 13px;'
                                  ' margin: 0 0 10px 0; padding: 0;"]'
                                  )[0].text_content()

            phone = addr_div.xpath('p[@style="font-size: 13px;'
                                   ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
            phone = phone.strip()
            if addr:
                person.add_contact_detail(type='address',
                                          value=addr,
                                          note='District Office')

            if phone:
                person.add_contact_detail(type='voice',
                                          value=phone,
                                          note='District Office')
        except IndexError:
            self.warning('no district address for {0}'.format(full_name))

        person.add_link(leg_url)
        person.add_source(url)
        person.add_source(leg_url)

        # committees (skip first link)
        for com in leg_doc.xpath('//a[contains(@href, "committee.php")]')[1:]:
            # A trailing ', ' in the link text means a role follows the name.
            if com.text.endswith(', '):
                committee, role = com.text_content().rsplit(', ', 1)

                # known roles
                # NOTE(review): an unknown abbreviation raises KeyError here.
                role = {'Treas.': 'treasurer',
                        'Secy.': 'secretary',
                        'Secy./Treas.': 'secretary/treasurer',
                        'V.C.': 'vice-chair',
                        '1st V.C.': 'first vice-chair',
                        'Co 1st V.C.': 'co-first vice-chair',
                        '2nd V.C.': 'second vice-chair',
                        '3rd V.C.': 'third vice-chair',
                        'Ex.Officio Member': 'ex-officio member',
                        'Chairman': 'chairman'}[role]
            else:
                committee = com.text
                role = 'member'

            # only yield each committee once
            if committee not in seen_committees:
                com = Organization(name=committee,
                                   classification='committee',
                                   chamber=chamber)
                com.add_source(url)
                seen_committees[committee] = com
                yield com
            else:
                com = seen_committees[committee]

            person.add_membership(com, role=role)

        yield person
def scrape(self):
    """Scrape Chicago City Council members and committees from Legistar.

    Yields current members (with committee memberships attached), former
    aldermen from FORMER_ALDERMEN, every committee Organization, and the
    FORMER_COMMITTEES / JOINT_COMMITTEES placeholder organizations.
    """
    # Committees built so far, keyed by name, so each is created once.
    committee_d = {}
    # Legistar "legislative bodies" that are not actually committees.
    non_committees = {'City Council', 'Office of the Mayor',
                      'Office of the City Clerk'}

    for councilman, committees in self.councilMembers():
        # Rows with no ward/office value are not sitting members.
        if councilman['Ward/Office'] == "":
            continue

        ward = councilman['Ward/Office']
        if ward not in {"Mayor", "Clerk"}:
            ward = "Ward {}".format(int(ward))
        role = "Alderman"
        # NOTE(review): Mayor/Clerk rows also get role "Alderman" here —
        # looks questionable; confirm against the Legistar data.

        p = Person(councilman['Person Name']['label'],
                   district=ward,
                   primary_org="legislature",
                   role=role)

        if councilman['Photo']:
            p.image = councilman['Photo']

        # Map Legistar column name -> (contact detail type, note).
        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        for contact_type, (type_, _note) in contact_types.items():
            if councilman[contact_type]:
                p.add_contact_detail(type=type_,
                                     value=councilman[contact_type],
                                     note=_note)

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['label'],
                                 note='E-mail')

        if councilman['Website']:
            p.add_link(councilman['Website']['url'])
        p.add_source(councilman['Person Name']['url'], note='web')

        for committee, _, _ in committees:
            committee_name = committee['Legislative Body']['label']
            if committee_name and committee_name not in non_committees:
                o = committee_d.get(committee_name, None)
                if o is None:
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name': 'Chicago City Council'})
                    o.add_source(committee['Legislative Body']['url'], note='web')
                    committee_d[committee_name] = o

                o.add_member(p, role=committee["Title"])

        yield p

    # Hard-coded former members no longer present in Legistar.
    for name, term in FORMER_ALDERMEN.items():
        p = Person(name=name,
                   primary_org="legislature",
                   start_date=term['term'][0],
                   end_date=term['term'][1],
                   district="Ward {}".format(term['ward']),
                   role='Alderman')
        # Chandler served an additional, earlier term.
        if name == 'Chandler, Michael D.':
            p.add_term('Alderman',
                       "legislature",
                       district="Ward {}".format(term['ward']),
                       start_date=datetime.date(2011, 5, 16),
                       end_date=datetime.date(2015, 5, 18))
        p.add_source(term['source'], note='web')
        yield p

    # Committees are yielded after all members so memberships are complete.
    for o in committee_d.values():
        yield o

    for committee_name in FORMER_COMMITTEES:
        o = Organization(committee_name,
                         classification='committee',
                         parent_id={'name': 'Chicago City Council'})
        o.add_source("https://chicago.legistar.com/Departments.aspx", note='web')
        yield o

    for joint_committee in JOINT_COMMITTEES:
        o = Organization(joint_committee,
                         classification='committee',
                         parent_id={'name': 'Chicago City Council'})
        o.add_source("https://chicago.legistar.com/Departments.aspx", note='web')
        yield o
def scrape_upper(self):
    """Scrape California Senate committees (standing, sub, joint, other).

    Yields one Organization per committee with its members attached.
    """
    # Retrieve index list of committees.
    url = "http://senate.ca.gov/committees"
    doc = self.lxmlize(url)

    standing_committees = doc.xpath(
        '//h2[text()="Standing Committees"]/../following-sibling::div//a'
    )
    sub_committees = doc.xpath(
        '//h2[text()="Sub Committees"]/../following-sibling::div//a'
    )
    joint_committees = doc.xpath(
        '//h2[text()="Joint Committees"]/../following-sibling::div//a'
    )
    other_committees = doc.xpath(
        '//h2[text()="Other"]/../following-sibling::div//a'
    )

    # Iterates over each committee [link] found.
    for committee in (
        standing_committees + sub_committees + joint_committees + other_committees
    ):
        # Get the text of the committee link, which should be the name of
        # the committee.
        (comm_name,) = committee.xpath("text()")

        org = Organization(
            chamber="upper", name=comm_name, classification="committee"
        )

        (comm_url,) = committee.xpath("@href")
        org.add_source(comm_url)
        comm_doc = self.lxmlize(comm_url)

        # Joint committees span both chambers; rewrite the stored fields.
        # NOTE(review): item assignment on `org` relies on the Organization
        # model supporting dict-style access — confirm against the model class.
        if comm_name.startswith("Joint"):
            org["chamber"] = "legislature"
            org["committee"] = (
                comm_name.replace("Joint ", "")
                .replace("Committee on ", "")
                .replace(" Committee", "")
            )

        # Subcommittees record the parent committee name plus their own.
        if comm_name.startswith("Subcommittee"):
            (full_comm_name,) = comm_doc.xpath(
                '//div[@class="banner-sitename"]/a/text()'
            )
            full_comm_name = re.search(
                r"^Senate (.*) Committee$", full_comm_name
            ).group(1)
            org["committee"] = full_comm_name

            comm_name = re.search(r"^Subcommittee.*?on (.*)$", comm_name).group(1)
            org["subcommittee"] = comm_name

        # Special case of members list being presented in text blob.
        member_blob = comm_doc.xpath(
            'string(//div[contains(@class, "field-item") and '
            'starts-with(text(), "Senate Membership:")][1]/text()[1])'
        )

        if member_blob:
            # Separate senate membership from assembly membership.
            # This should strip the header from assembly membership
            # string automatically.
            delimiter = "Assembly Membership:\n"
            senate_members, delimiter, assembly_members = member_blob.partition(
                delimiter
            )

            # Strip header from senate membership string.
            senate_members = senate_members.replace("Senate Membership:\n", "")

            # Clean membership strings.
            senate_members = senate_members.strip()
            assembly_members = assembly_members.strip()

            # Parse membership strings into lists.
            senate_members = senate_members.split("\n")
            assembly_members = assembly_members.split("\n")

            members = senate_members + assembly_members
        # Typical membership list format.
        else:
            members = comm_doc.xpath(
                '//a[(contains(@href, "/sd") or '
                'contains(@href, "assembly.ca.gov/a")) and '
                '(starts-with(text(), "Senator") or '
                'starts-with(text(), "Assembly Member"))]/text()'
            )

        for member in members:
            if not member.strip():
                continue

            # Split "Senator Jane Doe (Chair) (D)" into name and optional role.
            (mem_name, mem_role) = re.search(
                r"""(?ux)
                ^(?:Senator|Assembly\sMember)\s  # Legislator title
                (.+?)  # Capture the senator's full name
                (?:\s\((.{2,}?)\))?  # There may be role in parentheses
                (?:\s\([RD]\))?  # There may be a party affiliation
                \s*$
                """,
                member,
            ).groups()

            org.add_member(mem_name, role=mem_role if mem_role else "member")

        # _related is populated by add_member; empty means parsing found no one.
        if not org._related:
            self.warning("No members found for committee {}".format(comm_name))

        yield org
def get_organizations(self):
    """Yield the LA Metro Board of Directors with all of its posts,
    followed by the two affiliated corporations (Crenshaw Project
    Corporation and LA SAFE).
    """
    la_city_division = 'ocd-division/country:us/state:ca/place:los_angeles'
    la_county_prefix = 'ocd-division/country:us/state:ca/county:los_angeles'

    board = Organization(name="Board of Directors",
                         classification="legislature")

    # Ex-officio seat for the sitting LA mayor.
    board.add_post(
        'Mayor of the City of Los Angeles',
        'Board Member',
        division_id=la_city_division)

    # One seat per county supervisorial district (1-5).
    for seat in range(1, 6):
        board.add_post(
            'Los Angeles County Board Supervisor, District {}'.format(seat),
            'Board Member',
            division_id='{}/council_district:{}'.format(la_county_prefix, seat))

    board.add_post(
        'Appointee of Mayor of the City of Los Angeles',
        'Board Member',
        division_id=la_city_division)
    board.add_post(
        'Appointee of Governor of California',
        'Nonvoting Board Member',
        division_id='ocd-division/country:us/state:ca')

    # One appointed seat per City Selection Committee sector.
    sectors = (
        ('North County/San Fernando Valley', 'north_county_san_fernando_valley'),
        ('Southwest Corridor', 'southwest_corridor'),
        ('San Gabriel Valley', 'san_gabriel_valley'),
        ('Southeast Long Beach', 'southeast_long_beach'),
    )
    for sector_label, sector_slug in sectors:
        board.add_post(
            'Appointee of Los Angeles County City Selection Committee,'
            ' {} sector'.format(sector_label),
            'Board Member',
            division_id='{}/la_metro_sector:{}'.format(
                la_county_prefix, sector_slug))

    # Leadership posts carry no division.
    for leadership_title in ('Chair',
                             '1st Vice Chair',
                             '2nd Vice Chair',
                             'Chief Executive Officer'):
        board.add_post(leadership_title, leadership_title)

    yield board

    crenshaw = Organization(name="Crenshaw Project Corporation",
                            classification="corporation")
    crenshaw.add_source(
        'https://metro.legistar.com/DepartmentDetail.aspx?ID=32216&GUID=D790CC05-ACCB-451C-B576-2952090769F1'
    )
    yield crenshaw

    la_safe = Organization(name="LA SAFE", classification="corporation")
    la_safe.add_source(
        'https://metro.legistar.com/DepartmentDetail.aspx?ID=30222&GUID=5F27DA83-633F-4FEA-A4B0-0477551061B6&R=aef57793-1826-4cfa-b6e3-d6b42cf77527'
    )
    yield la_safe
def scrape(self):
    """Scrape Indiana committees from the IGA API for the latest session.

    Yields one Organization per committee; subcommittees are attached to
    their parent committee via self._parent_committees.
    """
    session = self.latest_session()
    subcomms = self.get_subcommittee_info(session)

    api_base_url = "https://api.iga.in.gov"
    html_base_url = "http://iga.in.gov/legislative/{}/committees/".format(
        session)

    client = ApiClient(self)
    r = client.get("committees", session=session)
    all_pages = client.unpaginate(r)

    for comm_info in all_pages:
        # this is kind of roundabout, but needed in order
        # to take advantage of all of our machinery to make
        # sure we're not overloading their api
        comm_link = comm_info["link"]
        comm_name = comm_link.split("/")[-1]
        if "withdrawn" in comm_name or "conference" in comm_name:
            continue
        try:
            comm_json = client.get("committee", committee_link=comm_link[1:])
        except HTTPError:
            self.logger.warning("Page does not exist")
            continue

        # Map the API chamber name to OCD chamber codes; a missing
        # chamber key means the committee is joint.
        try:
            chamber = comm_json["chamber"]["name"]
        except KeyError:
            chamber = 'joint'
        else:
            if chamber == "Senate":
                chamber = "upper"
            elif chamber == "House":
                chamber = "lower"
            else:
                raise AssertionError(
                    "Unknown committee chamber {}".format(chamber))

        name = comm_json["name"]
        try:
            owning_comm = subcomms[name]
        except KeyError:
            # Top-level committee.
            name = name.replace("Statutory Committee on", "").strip()
            comm = Organization(name=name,
                                chamber=chamber,
                                classification='committee')
            if name in subcomms.values():
                # Avoid identification issues, if committee names are re-used
                # between upper and lower chambers
                assert self._parent_committees.get(name) is None
                self._parent_committees[name] = comm
        else:
            # Subcommittee: attach to its previously-seen parent.
            name = name.replace("Statutory Committee on",
                                "").replace("Subcommittee", "").strip()
            comm = Organization(
                name=name,
                parent_id=self._parent_committees[owning_comm],
                classification='committee')

        chair = self.process_special_members(comm, comm_json, "chair")
        vicechair = self.process_special_members(comm, comm_json, "viceChair")
        ranking = self.process_special_members(comm, comm_json,
                                               "rankingMinMember")

        # leadership is also listed in membership
        # so we have to make sure we haven't seen them yet
        comm_members = [m for m in [chair, vicechair, ranking] if m]

        for mem in comm_json["members"]:
            mem_name = mem["firstName"] + " " + mem["lastName"]
            if mem_name not in comm_members:
                comm_members.append(mem_name)
                comm.add_member(mem_name)

        api_source = api_base_url + comm_link

        # BUGFIX: html_source was previously assigned only inside this
        # branch but added as a source unconditionally, which raised
        # NameError for the first committee whose link did not start with
        # "committee_" (or silently reused a stale URL from a previous
        # iteration). Only add the HTML source when it actually exists.
        if comm_name.startswith("committee_"):
            html_source = html_base_url + comm_name[len("committee_"):]
            comm.add_source(html_source)

        comm.add_source(api_source)
        yield comm