def test_person_add_membership_name():
    """Adding a membership by organization *name* stores a pseudo-id reference."""
    person = Person('Leonardo DiCaprio')
    person.add_membership('Academy of Motion Picture Arts and Sciences',
                          role='winner', start_date='2016')
    membership = person._related[0]
    membership.validate()
    expected_org = {'name': 'Academy of Motion Picture Arts and Sciences'}
    assert get_pseudo_id(membership.organization_id) == expected_org
    assert membership.person_id == person._id
    assert membership.role == 'winner'
    assert membership.start_date == '2016'
def test_person_add_membership_positional_role():
    """Memberships accept the role as a positional argument.

    NOTE: this test was previously named ``test_person_add_membership``, the
    same name as the keyword-role variant defined later in this module; the
    later definition shadowed this one, so it was never collected or run.
    Renamed so pytest runs both.
    """
    p = Person('Bob B. Bear')
    p.add_source('http://example.com')
    o = Organization('test org')
    p.add_membership(o, 'member', start_date='2007')
    assert len(p._related) == 1
    p._related[0].validate()
    assert p._related[0].person_id == p._id
    assert p._related[0].organization_id == o._id
    assert p._related[0].start_date == '2007'
def test_person_add_membership():
    """A membership built from an Organization object links person and org ids."""
    person = Person('Bob B. Bear')
    person.add_source('http://example.com')
    org = Organization('test org', classification='unknown')
    person.add_membership(org, role='member', start_date='2007')
    assert len(person._related) == 1
    membership = person._related[0]
    membership.validate()
    assert membership.person_id == person._id
    assert membership.organization_id == org._id
    assert membership.start_date == '2007'
def test_person_add_membership_org():
    """A membership may carry both a string start date and a date end date."""
    person = Person('Bob B. Bear')
    person.add_source('http://example.com')
    org = Organization('test org', classification='unknown')
    person.add_membership(org, role='member',
                          start_date='2007',
                          end_date=datetime.date(2015, 5, 8))
    assert len(person._related) == 1
    membership = person._related[0]
    membership.validate()
    assert membership.person_id == person._id
    assert membership.organization_id == org._id
    assert membership.start_date == '2007'
    assert membership.end_date == datetime.date(2015, 5, 8)
def scrape(self):
    """Yield the Temecula City Council organization and its members."""
    urls = Urls(dict(list=legislators_url), self)

    council = Organization(
        'Temecula City Council', classification='legislature')
    council.add_source(urls.list.url)
    yield council

    # Skip the roster table's header row.
    for row in urls.list.xpath('//table[2]//tr')[1:]:
        name, role = row.xpath('td/p[1]//font/text()')
        image = row.xpath('td/img/@src').pop()

        person = Person(name, image=image)
        membership = person.add_membership(council, role=role)

        # First href is a mailto: link; slicing off 7 chars drops the scheme.
        email, detail_url = row.xpath('td//a/@href')
        membership.contact_details.append(
            dict(type='email', value=email[7:], note='work'))

        person.add_source(urls.list.url)
        person.add_source(detail_url)
        yield person
def scrape(self):
    """Yield the Temecula City Council organization, then one Person per
    council member in the roster table, each with a council membership
    and a work e-mail contact detail."""
    urls = Urls(dict(list=legislators_url), self)
    council = Organization('Temecula City Council',
                           classification='legislature')
    council.add_source(urls.list.url)
    yield council
    # Skip the roster table's header row.
    for tr in urls.list.xpath('//table[2]//tr')[1:]:
        # Parse some attributes.
        name, role = tr.xpath('td/p[1]//font/text()')
        image = tr.xpath('td/img/@src').pop()
        # Create legislator.
        person = Person(name, image=image)
        # Add membership on council.
        memb = person.add_membership(council, role=role)
        # Add email address. The first href is a mailto: link; slicing off
        # the first 7 characters drops the "mailto:" scheme.
        email, detail_url = tr.xpath('td//a/@href')
        email = email[7:]
        memb.contact_details.append(
            dict(type='email', value=email, note='work'))
        # Add sources.
        person.add_source(urls.list.url)
        person.add_source(detail_url)
        yield person
def scrape(self):
    """Yield the Boise City Council organization and its members,
    scraped from the list page plus one detail page per member."""
    urls = Urls(dict(list=legislators_url), self)

    council = Organization('Boise City Council')
    council.add_source(legislators_url)
    yield council

    detail_hrefs = urls.list.xpath('//div[@id="content"]/div/a/@href')
    # Skip the mayor because his page has no name or email.
    for href in detail_hrefs[1:]:
        urls.add(detail=href)

        image = urls.detail.xpath('//div[@id="content"]/p/img/@src').pop()
        heading = urls.detail.xpath('//h1/text()').pop()
        # Drop the "Council " prefix, then split the leading role word
        # off of the member's name.
        heading = heading.replace('Council ', '')
        role, _, name = heading.partition(' ')

        person = Person(name, image=image)

        membership = person.add_membership(council, role=role)
        membership.add_source(urls.detail.url)

        # mailto: href minus its 7-character scheme prefix.
        email = urls.detail.xpath(
            '//a[contains(@href, "mailto")]/@href').pop()[7:]
        membership.contact_details.append(
            dict(type='email', value=email, note='work'))

        person.add_source(urls.list.url)
        person.add_source(urls.detail.url)
        yield person
def scrape(self):
    """Scrape New York City Council members and committees.

    Merges two feeds: the Legistar web UI (contact details, photo, district,
    party) and the Legistar API (terms and committee offices). Yields each
    committee Organization followed by every Person.
    """
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage

    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    # Index the web-scraped member records by stripped display name.
    web_info = {}
    for member, _ in web_scraper.councilMembers():
        name = member['Person Name']['label'].strip()
        web_info[name] = member

    city_council, = [body for body in self.bodies()
                     if body['BodyName'] == 'City Council']

    terms = collections.defaultdict(list)

    public_advocates = {
        # Match casing to Bill De Blasio as council member
        'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
        'The Public Advocate (Ms. James)': 'Letitia James',
    }

    for office in self.body_offices(city_council):
        name = office['OfficeRecordFullName']
        name = public_advocates.get(name, name).strip()
        terms[name].append(office)

        # Add past members (and public advocates) the web UI no longer
        # lists; the defaultdict yields None for every missing field.
        if name not in web_info:
            web_info[name] = collections.defaultdict(lambda: None)

    # Check that we have everyone we expect, formatted consistently, in
    # both information arrays. For instance, this will fail if we forget to
    # strip trailing spaces from names on one side or the other (which has
    # the effect of omitting information, such as post, from the scrape).
    assert set(web_info.keys()) == set(terms.keys())

    members = {}

    for member, offices in terms.items():
        p = Person(member)
        web = web_info[member]

        for term in offices:
            role = term['OfficeRecordTitle']
            if role == 'Public Advocate':
                role = 'Non-Voting Council Member'
            else:
                role = 'Council Member'

            # The web UI zero-pads single-digit district numbers.
            district = web.get('District', '').replace(' 0', ' ')

            p.add_term(role, 'legislature', district=district,
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

            party = web.get('Political Party')
            if party == 'Democrat':
                party = 'Democratic'
            if party:
                p.add_party(party)

        if web.get('Photo'):
            p.image = web['Photo']

        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        for contact_type, (type_, _note) in contact_types.items():
            # BUG FIX: this previously read ``web(contact_type)``, calling
            # the dict as a function and raising TypeError whenever a
            # contact value was present; subscript it instead.
            if web.get(contact_type) and web[contact_type] != 'N/A':
                p.add_contact_detail(type=type_,
                                     value=web[contact_type],
                                     note=_note)

        if web.get('E-mail'):
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['url'],
                                 note='E-mail')

        if web.get('Web site'):
            p.add_link(web['Web site']['url'], note='web site')

        if web.get('Notes'):
            p.extras = {'Notes': web['Notes']}

        if not p.sources:
            # Only add sources once; ``term`` is the last office from the
            # loop above, which suffices to derive this person's URLs.
            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

        members[member] = p

    committee_types = ['Committee',
                       'Inactive Committee',
                       'Select Committee',
                       'Subcommittee',
                       'Task Force',
                       'Land Use']  # Committee on Land Use

    body_types = {k: v for k, v in self.body_types().items()
                  if k in committee_types}

    for body in self.bodies():
        if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):

            # Skip typo in API data
            if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                continue

            parent_org = PARENT_ORGS.get(body['BodyName'],
                                         'New York City Council')
            body_name = body['BodyName']

            o = Organization(body_name,
                             classification='committee',
                             parent_id={'name': parent_org})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                note='web')

            for office in self.body_offices(body):
                # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                # 'Committee Member', None, 'CHAIRPERSON'
                role = office['OfficeRecordTitle']
                if role and role.lower() == 'chairperson':
                    role = 'Chairperson'
                else:
                    role = 'Member'

                person = office['OfficeRecordFullName']
                person = public_advocates.get(person, person).strip()

                if person in members:
                    p = members[person]
                else:
                    p = Person(person)
                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')
                    members[person] = p

                p.add_membership(
                    o, role=role,
                    start_date=self.toDate(office['OfficeRecordStartDate']),
                    end_date=self.toDate(office['OfficeRecordEndDate']))

            yield o

    for p in members.values():
        yield p
def scrape_chamber(self, chamber):
    """Scrape Arizona legislators for one chamber ('lower' or 'upper')
    from the azleg.gov member roster plus each member's detail page,
    yielding one Person per seated member."""
    body = {'lower': 'H', 'upper': 'S'}[chamber]
    url = 'http://www.azleg.gov/MemberRoster/?body=' + body
    page = self.get(url).text

    # there is a bad comment closing tag on this page
    page = page.replace('--!>', '-->')
    root = html.fromstring(page)

    path = '//table//tr'
    roster = root.xpath(path)[1:]  # first row is the table header
    for row in roster:
        position = ''
        name, district, party, email, room, phone, = row.xpath('td')

        if email.attrib.get('class') == 'vacantmember':
            continue  # Skip any vacant members.

        link = name.xpath('string(a/@href)')

        # A single child element means the cell is just the linked name;
        # extra trailing text carries a leadership position.
        if len(name) == 1:
            name = name.text_content().strip()
        else:
            position = name.tail.strip()
            name = name[0].text_content().strip()
        if '--' in name:
            name = name.split('--')[0].strip()

        linkpage = self.get(link).text
        linkpage = linkpage.replace('--!>', '-->')  # same bad tag as above
        linkroot = html.fromstring(linkpage)
        linkroot.make_links_absolute(link)

        photos = linkroot.xpath("//img[contains(@src, 'MemberPhoto')]")
        if len(photos) != 1:
            self.warning('no photo on ' + link)
            photo_url = ''
        else:
            photo_url = photos[0].attrib['src']

        district = district.text_content()
        party = party.text_content().strip()
        email = email.text_content().strip()

        if email.startswith('Email: '):
            email = email.replace('Email: ', '').lower() + '@azleg.gov'
        else:
            email = ''

        party = self.get_party(party)
        room = room.text_content().strip()
        if chamber == 'lower':
            address = "House of Representatives\n"
        else:
            address = "Senate\n"
        address = address + "1700 West Washington\n Room " + room \
            + "\nPhoenix, AZ 85007"

        phone = phone.text_content().strip()
        # Prefix the Phoenix area code when the cell omits it.
        if '602' not in re.findall(r'(\d+)', phone):
            phone = "602-" + phone

        leg = Person(primary_org=chamber, image=photo_url, name=name,
                     district=district, party=party)

        leg.add_contact_detail(type='address', value=address,
                               note='Capitol Office')
        leg.add_contact_detail(type='voice', value=phone,
                               note='Capitol Office')
        # NOTE(review): party is already passed to the Person constructor
        # above — confirm whether this extra add_party call is needed.
        leg.add_party(party=party)
        leg.add_link(link)
        if email:
            leg.add_contact_detail(type='email', value=email)

        if position:
            # NOTE(review): this attaches the leadership position as a
            # membership in the *party* organization — confirm the intended
            # target (chamber vs. party).
            leg.add_membership(name_or_org=party, role=position)
            # leg.add_role(position, term, chamber=chamber,
            #              district=district, party=party)

        leg.add_source(url)

        # Probably just get this from the committee scraper
        # self.scrape_member_page(link, session, chamber, leg)
        yield leg
def scrape(self):
    """Scrape Pittsburgh City Council members and committees from Legistar.

    The API supplies terms, addresses and committee offices; the web UI
    supplies e-mail addresses. Yields committee Organizations and every
    council member Person.
    """
    body_types = self.body_types()

    city_council, = [body for body in self.bodies()
                     if body["BodyName"] == "City Council"]

    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        # Skip placeholder records for vacant seats.
        if "VACAN" not in office["OfficeRecordFullName"]:
            terms[office["OfficeRecordFullName"].strip()].append(office)

    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = "https://pittsburgh.legistar.com/People.aspx"
    web_scraper.COMMITTEELIST = "https://pittsburgh.legistar.com/Departments.aspx"

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage

    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    # Index web UI records by display name for the e-mail lookup below.
    web_info = {}
    for member in web_scraper.councilMembers():
        web_info[member["Person Name"]] = member

    members = {}
    for member, offices in terms.items():
        person = Person(member)
        for term in offices:
            # NOTE(review): assigned but not used in this loop — the term
            # label is hard-coded to "Councilmember" below.
            role = term["OfficeRecordTitle"]
            person.add_term("Councilmember", "legislature",
                            start_date = self.toDate(term["OfficeRecordStartDate"]),
                            end_date = self.toDate(term["OfficeRecordEndDate"]))

            if member in web_info:
                web = web_info[member]
                if web["E-mail"] and web["E-mail"]["label"] and web["E-mail"]["label"] != "N/A":
                    person.add_contact_detail(type="email",
                                              value=web["E-mail"]["label"],
                                              note="E-mail")

            person_source_data = self.person_sources_from_office(term)
            person_api_url, person_api_response = person_source_data
            person.add_source(person_api_url, note="api")

            if person_api_response["PersonAddress1"]:
                address = (person_api_response["PersonAddress1"] + ", "
                           + person_api_response["PersonCity1"] + ", "
                           + person_api_response["PersonState1"] + " "
                           + person_api_response["PersonZip1"])
                person.add_contact_detail(type="address",
                                          value=address,
                                          note="Office address")

            if person_api_response["PersonPhone"]:
                person.add_contact_detail(type="voice",
                                          value=person_api_response["PersonPhone"],
                                          note="Office phone")

            if person_api_response["PersonWWW"]:
                person.add_contact_detail(type="url",
                                          value=person_api_response["PersonWWW"],
                                          note="District website")

        members[member] = person

    for body in self.bodies():
        if body["BodyTypeId"] == body_types["Committee"]:
            body_name_clean = body["BodyName"].strip()
            organization = Organization(body_name_clean,
                                        classification="committee",
                                        parent_id={"name": "Pittsburgh City Council"})
            organization.add_source(
                self.BASE_URL + "/bodies/{BodyId}".format(**body), note="api")

            for office in self.body_offices(body):
                role = office["OfficeRecordMemberType"]
                # NOTE(review): the ``or role == "Councilmember"`` clause is
                # redundant — "Councilmember" is not in the tuple, so the
                # first operand is already True in that case.
                if role not in ("Vice Chair", "Chair") or role == "Councilmember":
                    role = "Member"

                person = office["OfficeRecordFullName"].strip()
                if person in members:
                    person = members[person]
                else:
                    # NOTE(review): unlike sibling scrapers, a Person created
                    # here is never added to ``members``, so it is not yielded
                    # below — confirm whether that is intentional.
                    person = Person(person)

                person.add_membership(body_name_clean, role=role,
                                      start_date = self.toDate(office["OfficeRecordStartDate"]),
                                      end_date = self.toDate(office["OfficeRecordEndDate"]))

            yield organization

    for person in members.values():
        yield person
def scrape(self):
    """Scrape Denver City Council: yield the council Organization (with one
    post per district) and a Person per member, each with a district
    membership carrying phone and e-mail contact details."""
    urls = Urls(dict(list=legislators_url), self)

    council = Organization('Denver City Council')
    council.add_source(legislators_url)

    # Get image urls, names, detail urls, and districts.
    image_xpath = '//a[contains(@href, "councildistrict")]/img/@src'
    image_urls = urls.list.xpath(image_xpath)

    name_xpath = '//a[contains(@href, "councildistrict")]'
    names = [a.text_content() for a in urls.list.xpath(name_xpath)][:-1]
    # Drops empty strings; note this is a lazy iterator, consumed by the
    # zip() below.
    names = filter(None, names)

    person_urls_xpath = '//a[contains(@href, "councildistrict")]/@href'
    person_urls = urls.list.xpath(person_urls_xpath)

    # Derive post labels ("Council District N" / "Council At-Large") from
    # the table cells that contain the member photos.
    post_ids = []
    xpath = '//a[contains(@href, "councildistrict")]/img/ancestor::td'
    for td in urls.list.xpath(xpath):
        text = td.text_content()
        m = re.search('Council District \d+', text)
        if m:
            post_ids.append(m.group())
            continue
        m = re.search('Council At-Large', text)
        if m:
            post_ids.append('Council At-Large')

    for post_id in post_ids:
        council.add_post(post_id, post_id)

    yield council

    data = zip(image_urls, names, person_urls, post_ids)
    for image_url, name, person_url, post_id in data:
        # Create legislator.
        person = Person(name, image=image_url)

        # Add sources.
        urls.add(detail=person_url)
        person.add_source(urls.list.url, note='list')
        person.add_source(urls.detail.url, note='detail')

        # Add membership on council.
        memb = person.add_membership(council, district=post_id.strip())
        memb.add_source(urls.detail.url)

        # Contact info lives in one of two content divs, depending on page.
        xpath = '//div[@id="dnn_column3"]'
        contact_text = urls.detail.xpath(xpath)[0].text_content()
        if not contact_text.strip():
            xpath = '//div[contains(@id, "dnn_RightPaneWide")]'
            contact_text = urls.detail.xpath(xpath)[0].text_content()

        phone_regex = r'\(\d{3}\)[ -]*\d{3}-\d{4}'
        phone = re.search(phone_regex, contact_text).group()
        memb.contact_details.append(
            dict(type='phone', value=phone, note='work'))

        # Add email address.
        # NOTE(review): this pattern looks like a scrubbed/redacted e-mail
        # regex — it matches the literal text "[email protected]" only.
        # Confirm the intended pattern before relying on it.
        email_regex = r'\[email protected]'
        email = re.search(email_regex, contact_text).group()
        memb.contact_details.append(
            dict(type='email', value=email, note='work'))

        yield person
def scrape(self):
    '''
    Scrape the web to create a dict with all active organizations.
    Then, we can access the correct URL for the organization detail page.

    Yields the Metro Board of Directors committees (as Organizations) and
    every board/committee member (as Persons), merging web UI organization
    URLs with API term and office data.
    '''
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)

    web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'

    # Map organization name -> web UI info (used for committee source URLs).
    web_info = {}

    for _, organizations in web_scraper.councilMembers():
        for organization, _, _ in organizations:
            organization_name = organization['Department Name'][
                'label'].strip()
            organization_info = organization['Department Name']

            web_info[organization_name] = organization_info

    body_types = self.body_types()

    board_of_directors, = [
        body for body in self.bodies()
        if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
    ]
    # Normalize the API's meeting-centric name to the org's real name.
    board_of_directors["BodyName"] = "Board of Directors"

    terms = collections.defaultdict(list)
    for office in self.body_offices(board_of_directors):
        terms[office['OfficeRecordFullName']].append(office)

    members = {}
    for member, offices in terms.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']

            # Titles other than plain membership (e.g. Chair) become
            # appointed terms.
            if role not in {'Board Member', 'non-voting member'}:
                p.add_term(
                    role,
                    'legislature',
                    start_date=self.toDate(term['OfficeRecordStartDate']),
                    end_date=self.toDate(term['OfficeRecordEndDate']),
                    appointment=True)

            if role != 'Chief Executive Officer':
                if role == 'non-voting member':
                    member_type = 'Nonvoting Board Member'
                    post = NONVOTING_POSTS.get(member)
                else:
                    member_type = 'Board Member'
                    post = VOTING_POSTS.get(member)

                start_date = self.toDate(term['OfficeRecordStartDate'])
                end_date = self.toDate(term['OfficeRecordEndDate'])
                board_membership = p.add_term(member_type, 'legislature',
                                              district=post,
                                              start_date=start_date,
                                              end_date=end_date)

                # Flag members who only served in an acting capacity.
                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                    p.name)
                if acting_member_end_date and acting_member_end_date <= end_date:
                    board_membership.extras = {'acting': 'true'}

            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Committee']:
            organization_name = body['BodyName'].strip()

            o = Organization(organization_name,
                             classification='committee',
                             parent_id={'name': 'Board of Directors'})

            organization_info = web_info.get(organization_name, {})
            # NOTE(review): the fallback concatenates WEB_URL with an
            # absolute URL, producing a malformed address — confirm the
            # intended default.
            organization_url = organization_info.get(
                'url',
                self.WEB_URL + 'https://metro.legistar.com/Departments.aspx')

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(organization_url, note='web')

            for office in self.body_offices(body):
                role = office['OfficeRecordTitle']
                if role not in ("Chair", "Vice Chair",
                                "Chief Executive Officer"):
                    if role == 'non-voting member':
                        role = 'Nonvoting Member'
                    else:
                        role = 'Member'

                person = office['OfficeRecordFullName']
                if person in members:
                    p = members[person]
                else:
                    # Committee-only people (not on the board roster).
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                start_date = self.toDate(office['OfficeRecordStartDate'])
                end_date = self.toDate(office['OfficeRecordEndDate'])
                membership = p.add_membership(organization_name,
                                              role=role,
                                              start_date=start_date,
                                              end_date=end_date)

                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                    p.name)
                if acting_member_end_date and acting_member_end_date <= end_date:
                    membership.extras = {'acting': 'true'}

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Scrape Sacramento City Council members, standing committees, and
    boards/commissions from the Legistar API, yielding Organizations first
    and then every Person."""
    body_types = self.body_types()

    # NOTE: the trailing space in 'City Council ' matches the body name as
    # it appears in the API data.
    city_council, = [body for body in self.bodies()
                     if body['BodyName'] == 'City Council ']

    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        # Skip the vendor's internal test account.
        if office['OfficeRecordFullName'] != "Granicus BA":
            terms[office['OfficeRecordFullName']].append(office)

    members = {}
    for member, offices in terms.items():
        p = Person(member)
        for term in offices:
            role = term['OfficeRecordTitle']
            p.add_term(role, 'legislature',
                       # district = "District {}".format(int(web['District/Office'])),
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Standing Committees']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Sacramento City Council'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                note='web')

            for office in self.body_offices(body):
                # messed up record for joanna thompson
                if office['OfficeRecordId'] == 1055:
                    continue

                role = office['OfficeRecordTitle']
                if role not in ("Vice Chair", "Chairperson"):
                    role = 'Member'

                person = office['OfficeRecordFullName'].strip()
                if person in members:
                    p = members[person]
                else:
                    # Committee member not on the current council roster.
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                p.add_membership(body['BodyName'], role=role,
                                 start_date=self.toDate(office['OfficeRecordStartDate']),
                                 end_date=self.toDate(office['OfficeRecordEndDate']))

            yield o

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Boards or Commission']:
            o = Organization(body['BodyName'],
                             classification='commission',
                             parent_id={'name': 'Sacramento City Council'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                note='web')

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Scrape the Metro Board of Directors and its committees from the
    Legistar API, yielding committee Organizations and every board or
    committee member Person."""
    body_types = self.body_types()

    board_of_directors, = [
        body for body in self.bodies()
        if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
    ]
    # Normalize the API's meeting-centric name to the org's real name.
    board_of_directors["BodyName"] = "Board of Directors"

    terms = collections.defaultdict(list)
    for office in self.body_offices(board_of_directors):
        terms[office['OfficeRecordFullName']].append(office)

    members = {}
    for member, offices in terms.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']

            # Titles other than plain membership (e.g. Chair) become
            # appointed terms.
            if role not in {'Board Member', 'non-voting member'}:
                p.add_term(
                    role,
                    'legislature',
                    start_date=self.toDate(term['OfficeRecordStartDate']),
                    end_date=self.toDate(term['OfficeRecordEndDate']),
                    appointment=True)

            if role != 'Chief Executive Officer':
                if role == 'non-voting member':
                    member_type = 'Nonvoting Board Member'
                    post = NONVOTING_POSTS.get(member)
                else:
                    member_type = 'Board Member'
                    post = VOTING_POSTS.get(member)

                p.add_term(
                    member_type,
                    'legislature',
                    district=post,
                    start_date=self.toDate(term['OfficeRecordStartDate']),
                    end_date=self.toDate(term['OfficeRecordEndDate']))

            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Committee']:
            org_name = body['BodyName'].strip()

            o = Organization(org_name,
                             classification='committee',
                             parent_id={'name': 'Board of Directors'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                note='web')

            for office in self.body_offices(body):
                role = office['OfficeRecordTitle']
                if role not in ("Chair", "Vice Chair"):
                    if role == 'non-voting member':
                        role = 'Nonvoting Member'
                    else:
                        role = 'Member'

                person = office['OfficeRecordFullName']
                if person in members:
                    p = members[person]
                else:
                    # Committee-only people (not on the board roster).
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                p.add_membership(org_name, role=role,
                                 start_date=self.toDate(
                                     office['OfficeRecordStartDate']),
                                 end_date=self.toDate(
                                     office['OfficeRecordEndDate']))

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Scrape Chicago aldermen and council committees, merging Legistar web
    UI details (ward, photo, contacts) with API term and office records.
    Yields each committee Organization and then every Person."""
    body_types = self.body_types()

    city_council, = [
        body for body in self.bodies()
        if body['BodyName'] == 'City Council'
    ]

    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        # Skip placeholder records for vacant seats.
        if 'VACAN' not in office['OfficeRecordFullName']:
            terms[office['OfficeRecordFullName'].strip()].append(office)

    web_scraper = LegistarPersonScraper(None, None)
    web_scraper.MEMBERLIST = 'https://chicago.legistar.com/DepartmentDetail.aspx?ID=12357&GUID=4B24D5A9-FED0-4015-9154-6BFFFB2A8CB4&R=8bcbe788-98cd-4040-9086-b34fa8e49881'
    web_scraper.ALL_MEMBERS = '3:3'

    web_info = {}
    for member, _ in web_scraper.councilMembers(
            {'ctl00$ContentPlaceHolder$lstName': 'City Council'}):
        web_info[member['Person Name']['label']] = member

    # Past members missing from the web UI; defaultdict yields None for
    # every field except the wards patched in below.
    web_info['Balcer, James'] = collections.defaultdict(lambda: None)
    web_info['Fioretti, Bob'] = collections.defaultdict(lambda: None)

    web_info['Balcer, James']['Ward/Office'] = 11
    web_info['Fioretti, Bob']['Ward/Office'] = 2

    members = {}
    for member, offices in terms.items():
        web = web_info[member]

        p = Person(member)
        for term in offices:
            role = term['OfficeRecordTitle']
            p.add_term('Alderman',
                       'legislature',
                       district="Ward {}".format(int(web['Ward/Office'])),
                       start_date=self.toDate(
                           term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        if web.get('Photo'):
            p.image = web['Photo']

        contact_types = {
            "City Hall Address": ("address", "City Hall Address"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        for contact_type, (type_, _note) in contact_types.items():
            if web[contact_type] and web[contact_type] != 'N/A':
                p.add_contact_detail(type=type_,
                                     value=web[contact_type],
                                     note=_note)

        if web["E-mail"] and web["E-mail"][
                "label"] and web["E-mail"]["label"] != 'N/A':
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['label'],
                                 note='E-mail')

        if web['Website']:
            p.add_link(web['Website']['url'])

        # ``term`` is the last office from the loop above, which is enough
        # to derive this person's source URLs.
        source_urls = self.person_sources_from_office(term)
        person_api_url, person_web_url = source_urls
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Committee']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                note='web')

            for office in self.body_offices(body):
                # messed up record for joanna thompson
                if office['OfficeRecordId'] == 1055:
                    continue

                role = office['OfficeRecordTitle']
                if role not in ("Vice Chair", "Chairman"):
                    role = 'Member'

                person = office['OfficeRecordFullName'].strip()
                if person in members:
                    p = members[person]
                else:
                    # Committee member not on the current council roster.
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                p.add_membership(body['BodyName'],
                                 role=role,
                                 start_date=self.toDate(
                                     office['OfficeRecordStartDate']),
                                 end_date=self.toDate(
                                     office['OfficeRecordEndDate']))

            yield o

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Joint Committee']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                note='web')

            yield o

    for p in members.values():
        yield p
def scrape_chamber(self, chamber):
    """Scrape South Carolina legislators for one chamber ('lower' or
    'upper') from scstatehouse.gov, yielding each committee Organization
    once (on first sight) and one Person per member."""
    if chamber == 'lower':
        url = 'http://www.scstatehouse.gov/member.php?chamber=H'
    else:
        url = 'http://www.scstatehouse.gov/member.php?chamber=S'

    # committee name -> Organization, so each is yielded only once.
    seen_committees = {}

    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    for a in doc.xpath('//a[@class="membername"]'):
        full_name = a.text
        leg_url = a.get('href')

        if full_name.startswith('Senator'):
            full_name = full_name.replace('Senator ', '')
        if full_name.startswith('Representative'):
            full_name = full_name.replace('Representative ', '')

        leg_html = self.get(leg_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)

        if 'Resigned effective' in leg_html:
            self.info('Resigned')
            continue

        # The page's styling is the only stable handle on these fields.
        party, district, _ = leg_doc.xpath('//p[@style="font-size: 17px;'
                                           ' margin: 0 0 0 0; padding: 0;"]/text()')

        if 'Republican' in party:
            party = 'Republican'
        elif 'Democrat' in party:
            party = 'Democratic'

        # District # - County - Map
        district = district.split()[1]

        try:
            photo_url = leg_doc.xpath('//img[contains(@src,"/members/")]/@src')[0]
        except IndexError:
            self.warning("No Photo URL for {}".format(full_name))
            photo_url = ''
        person = Person(name=full_name, district=district,
                        party=party, primary_org=chamber,
                        image=photo_url)

        # office address / phone
        try:
            addr_div = leg_doc.xpath('//div[@style="float: left; width: 225px;'
                                     ' margin: 10px 5px 0 20px; padding: 0;"]')[0]
            capitol_address = addr_div.xpath('p[@style="font-size: 13px;'
                                             ' margin: 0 0 10px 0; padding: 0;"]'
                                             )[0].text_content()

            phone = addr_div.xpath('p[@style="font-size: 13px;'
                                   ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
            capitol_phone = phone.strip()

            if capitol_address:
                person.add_contact_detail(type='address',
                                          value=capitol_address,
                                          note='Capitol Office')

            if capitol_phone:
                person.add_contact_detail(type='voice',
                                          value=capitol_phone,
                                          note='Capitol Office')
        except IndexError:
            self.warning('no capitol address for {0}'.format(full_name))

        # home address / phone
        try:
            addr_div = leg_doc.xpath('//div[@style="float: left;'
                                     ' width: 225px; margin: 10px 0 0 20px;"]')[0]
            addr = addr_div.xpath('p[@style="font-size: 13px;'
                                  ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

            phone = addr_div.xpath('p[@style="font-size: 13px;'
                                   ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
            phone = phone.strip()
            if addr:
                person.add_contact_detail(type='address',
                                          value=addr,
                                          note='District Office')

            if phone:
                person.add_contact_detail(type='voice',
                                          value=phone,
                                          note='District Office')
        except IndexError:
            self.warning('no district address for {0}'.format(full_name))

        person.add_link(leg_url)
        person.add_source(url)
        person.add_source(leg_url)

        # committees (skip first link)
        for com in leg_doc.xpath('//a[contains(@href, "committee.php")]')[1:]:
            if com.text.endswith(', '):
                committee, role = com.text_content().rsplit(', ', 1)

                # known roles
                role = {'Treas.': 'treasurer',
                        'Secy.': 'secretary',
                        'Secy./Treas.': 'secretary/treasurer',
                        'V.C.': 'vice-chair',
                        '1st V.C.': 'first vice-chair',
                        'Co 1st V.C.': 'co-first vice-chair',
                        '2nd V.C.': 'second vice-chair',
                        '3rd V.C.': 'third vice-chair',
                        'Ex.Officio Member': 'ex-officio member',
                        'Chairman': 'chairman'}[role]
            else:
                committee = com.text
                role = 'member'

            # only yield each committee once
            if committee not in seen_committees:
                com = Organization(name=committee,
                                   classification='committee',
                                   chamber=chamber)
                com.add_source(url)
                seen_committees[committee] = com
                yield com
            else:
                com = seen_committees[committee]

            person.add_membership(com, role=role)

        yield person
def scrape(self):
    '''
    Scrape the web to create a dict with all active organizations.
    Then, we can access the correct URL for the organization detail page.

    Yields Person objects for Metro board members (with board terms and
    acting-member markers) followed by committee Organization objects.
    '''
    web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'

    # Organization name -> web-side info (label/url) for detail pages.
    web_info = {}
    for _, organizations in web_scraper.councilMembers():
        for organization, _, _ in organizations:
            organization_name = organization['Department Name']['label'].strip()
            organization_info = organization['Department Name']
            web_info[organization_name] = organization_info

    body_types = self.body_types()

    # One-element unpacking: raises if the API does not return exactly
    # one body with this name.
    board_of_directors, = [body for body in self.bodies()
                           if body['BodyName'] == 'Board of Directors - Regular Board Meeting']
    board_of_directors["BodyName"] = "Board of Directors"

    # Group office records (terms) by member name.
    terms = collections.defaultdict(list)
    for office in self.body_offices(board_of_directors):
        terms[office['OfficeRecordFullName']].append(office)

    members = {}
    for member, offices in terms.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']

            # Titles other than plain membership (e.g. Chair) become
            # appointed terms in their own right.
            if role not in {'Board Member', 'non-voting member'}:
                p.add_term(role,
                           'legislature',
                           start_date=self.toDate(term['OfficeRecordStartDate']),
                           end_date=self.toDate(term['OfficeRecordEndDate']),
                           appointment=True)

            if role != 'Chief Executive Officer':
                if role == 'non-voting member':
                    member_type = 'Nonvoting Board Member'
                    post = NONVOTING_POSTS.get(member)
                else:
                    member_type = 'Board Member'
                    post = VOTING_POSTS.get(member)

                start_date = self.toDate(term['OfficeRecordStartDate'])
                end_date = self.toDate(term['OfficeRecordEndDate'])

                board_membership = p.add_term(member_type,
                                              'legislature',
                                              district=post,
                                              start_date=start_date,
                                              end_date=end_date)

                # Mark terms for known acting members whose acting
                # period covers this term's end date.
                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(p.name)
                if acting_member_end_date and acting_member_end_date <= end_date:
                    board_membership.extras = {'acting': 'true'}

        # Sources come from the last term seen in the loop above.
        source_urls = self.person_sources_from_office(term)
        person_api_url, person_web_url = source_urls
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Committee']:
            organization_name = body['BodyName'].strip()

            o = Organization(organization_name,
                             classification='committee',
                             parent_id={'name': 'Board of Directors'})

            organization_info = web_info.get(organization_name, {})
            # NOTE(review): the fallback concatenates WEB_URL with an
            # absolute https URL, which looks malformed — confirm intent.
            organization_url = organization_info.get('url',
                                                     self.WEB_URL + 'https://metro.legistar.com/Departments.aspx')

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
            o.add_source(organization_url, note='web')

            for office in self.body_offices(body):
                role = office['OfficeRecordTitle']
                if role not in ("Chair", "Vice Chair", "Chief Executive Officer"):
                    if role == 'non-voting member':
                        role = 'Nonvoting Member'
                    else:
                        role = 'Member'

                person = office['OfficeRecordFullName']
                if person in members:
                    p = members[person]
                else:
                    # Committee-only people not on the board get created
                    # (and sourced) here.
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                start_date = self.toDate(office['OfficeRecordStartDate'])
                end_date = self.toDate(office['OfficeRecordEndDate'])

                membership = p.add_membership(organization_name,
                                              role=role,
                                              start_date=start_date,
                                              end_date=end_date)

                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(p.name)
                if acting_member_end_date and acting_member_end_date <= end_date:
                    membership.extras = {'acting': 'true'}

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Scrape Chicago City Council members and committees.

    Merges Legistar API office records with scraped web-page details,
    then yields committee Organizations and Person objects.
    """
    body_types = self.body_types()

    # One-element unpacking: raises if 'City Council' is not unique.
    city_council, = [body for body in self.bodies()
                     if body['BodyName'] == 'City Council']

    # Group office records (terms) by member name; skip vacancies.
    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        if 'vacan' not in office['OfficeRecordFullName'].lower():
            terms[office['OfficeRecordFullName'].strip()].append(office)

    web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'https://chicago.legistar.com/DepartmentDetail.aspx?ID=12357&GUID=4B24D5A9-FED0-4015-9154-6BFFFB2A8CB4&R=8bcbe788-98cd-4040-9086-b34fa8e49881'
    web_scraper.ALL_MEMBERS = '3:3'

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage

    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    # Member name -> scraped web-page row.
    web_info = {}
    for member, _ in web_scraper.councilMembers({'ctl00$ContentPlaceHolder$lstName': 'City Council'}):
        web_info[member['Person Name']['label']] = member

    # Hand-patched entries for members missing from the web listing;
    # defaultdict(None) makes unknown fields read as None/falsy below.
    web_info['Balcer, James'] = collections.defaultdict(lambda: None)
    web_info['Fioretti, Bob'] = collections.defaultdict(lambda: None)
    web_info['Balcer, James']['Ward/Office'] = 11
    web_info['Fioretti, Bob']['Ward/Office'] = 2

    members = {}
    for member, offices in terms.items():
        web = web_info[member]

        p = Person(member)
        for term in offices:
            role = term['OfficeRecordTitle']
            p.add_term('Alderman',
                       'legislature',
                       district="Ward {}".format(int(web['Ward/Office'])),
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        if web.get('Photo'):
            p.image = web['Photo']

        # Web field name -> (contact detail type, note).
        contact_types = {
            "City Hall Address": ("address", "City Hall Address"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        for contact_type, (type_, _note) in contact_types.items():
            if web[contact_type] and web[contact_type] != 'N/A':
                p.add_contact_detail(type=type_,
                                     value=web[contact_type],
                                     note=_note)

        if web["E-mail"] and web["E-mail"]["label"] and web["E-mail"]["label"] != 'N/A':
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['label'],
                                 note='E-mail')

        if web['Website']:
            p.add_link(web['Website']['url'])

        # Sources come from the last term seen in the loop above.
        source_urls = self.person_sources_from_office(term)
        person_api_url, person_web_url = source_urls
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Committee']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
            o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body), note='web')

            for office in self.body_offices(body):
                # messed up record for joanna thompson
                if office['OfficeRecordId'] == 1055:
                    continue

                role = office['OfficeRecordTitle']
                if role not in ("Vice Chair", "Chairman"):
                    role = 'Member'

                person = office['OfficeRecordFullName'].strip()
                if person in members:
                    p = members[person]
                else:
                    # Committee-only people get created (and sourced) here.
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                # Some records have a null end date, which toDate rejects.
                try:
                    end_date = self.toDate(office['OfficeRecordEndDate'])
                except TypeError:
                    end_date = ''

                p.add_membership(body['BodyName'],
                                 role=role,
                                 start_date=self.toDate(office['OfficeRecordStartDate']),
                                 end_date=end_date)

            yield o

    # Joint committees are yielded without memberships.
    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Joint Committee']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Chicago City Council'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
            o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body), note='web')

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Scrape Sacramento City Council members, standing committees,
    and boards/commissions from the Legistar API.
    """
    body_types = self.body_types()

    # Note the trailing space in the body name — present in the API data.
    city_council, = [
        body for body in self.bodies()
        if body['BodyName'] == 'City Council '
    ]

    # Group office records (terms) by member name; skip the Granicus
    # service account.
    terms = collections.defaultdict(list)
    for office in self.body_offices(city_council):
        if office['OfficeRecordFullName'] != "Granicus BA":
            terms[office['OfficeRecordFullName']].append(office)

    members = {}
    for member, offices in terms.items():
        p = Person(member)
        for term in offices:
            role = term['OfficeRecordTitle']
            p.add_term(
                role,
                'legislature',
                # district = "District {}".format(int(web['District/Office'])),
                start_date=self.toDate(term['OfficeRecordStartDate']),
                end_date=self.toDate(term['OfficeRecordEndDate']))

        # Sources come from the last term seen in the loop above.
        source_urls = self.person_sources_from_office(term)
        person_api_url, person_web_url = source_urls
        p.add_source(person_api_url, note='api')
        p.add_source(person_web_url, note='web')

        members[member] = p

    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Standing Committees']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Sacramento City Council'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                note='web')

            for office in self.body_offices(body):
                # messed up record for joanna thompson
                # NOTE(review): this id/comment also appears in the
                # Chicago scraper — confirm it applies to this dataset.
                if office['OfficeRecordId'] == 1055:
                    continue

                role = office['OfficeRecordTitle']
                if role not in ("Vice Chair", "Chairperson"):
                    role = 'Member'

                person = office['OfficeRecordFullName'].strip()
                if person in members:
                    p = members[person]
                else:
                    # Committee-only people get created (and sourced) here.
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                p.add_membership(body['BodyName'],
                                 role=role,
                                 start_date=self.toDate(office['OfficeRecordStartDate']),
                                 end_date=self.toDate(office['OfficeRecordEndDate']))

            yield o

    # Boards and commissions are yielded without memberships.
    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Boards or Commission']:
            o = Organization(body['BodyName'],
                             classification='commission',
                             parent_id={'name': 'Sacramento City Council'})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body),
                note='web')

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Scrape board-of-directors members and committee bodies.

    Yields one Person per unique board office holder (with one term per
    office record), then each committee Organization with memberships,
    and finally any committee-only ("adjunct") people.
    """
    body_types = self.body_types()

    # One-element unpacking: raises if 'Board of Directors' is not unique.
    board_of_directors, = [
        body for body in self.bodies()
        if body['BodyName'] == 'Board of Directors'
    ]

    # Group office records (terms) by member name.
    members = {}
    for office in self.body_offices(board_of_directors):
        members.setdefault(office['OfficeRecordFullName'], []).append(office)

    for member, offices in members.items():
        p = Person(member)
        for term in offices:
            role = term['OfficeRecordTitle']
            if role != 'non-voting member':
                role = 'Board Member'
                post = VOTING_POSTS.get(member)
            else:
                role = 'Nonvoting Board Member'
                post = NONVOTING_POSTS.get(member)

            # BUG FIX: the dates must come from this term, not from the
            # stale `office` variable left over from the grouping loop
            # above (which pinned every term to the last record seen).
            p.add_term(role,
                       'legislature',
                       district=post,
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        legistar_api = self.BASE_URL + '/OfficeRecords/'
        p.add_source(legistar_api, note='api')
        # (leftover debug `print(p)` removed)
        yield p

    adjunct_members = {}
    for body in self.bodies():
        if body['BodyTypeId'] == body_types['Committee']:
            o = Organization(body['BodyName'],
                             classification='committee',
                             parent_id={'name': 'Board of Directors'})
            o.add_source(self.BASE_URL + '/Bodies/')

            for office in self.body_offices(body):
                role = office['OfficeRecordTitle']
                if role not in ("Chair", "Vice Chair"):
                    role = 'Member'

                person = office['OfficeRecordFullName']
                if person not in members:
                    # Committee-only person: track separately so they are
                    # yielded once at the end.
                    if person not in adjunct_members:
                        p = Person(person)
                        # TODO(review): 'foo' is a placeholder source URL
                        # — replace with a real source.
                        p.add_source('foo')
                    else:
                        p = adjunct_members[person]
                    p.add_membership(body['BodyName'],
                                     role=role,
                                     start_date=self.toDate(
                                         office['OfficeRecordStartDate']),
                                     end_date=self.toDate(
                                         office['OfficeRecordEndDate']))
                    adjunct_members[person] = p
                else:
                    # Board members already yielded above get attached to
                    # the committee directly.
                    o.add_member(office['OfficeRecordFullName'], role,
                                 start_date=self.toDate(
                                     office['OfficeRecordStartDate']),
                                 end_date=self.toDate(
                                     office['OfficeRecordEndDate']))
            yield o

    for p in adjunct_members.values():
        yield p
def scrape(self):
    """Scrape CT legislators from the state's CSV export.

    Yields committee Organizations (once each, on first sight) and one
    Person per CSV row.

    Raises:
        AssertionError: if the CSV header row or a district value does
            not match expectations.
        ValueError: if an email field is neither empty, an address, nor
            a contact-form URL.
    """
    # chambers = [chamber] if chamber is not None else ['upper', 'lower']
    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    page = self.get(leg_url)

    committees = {}

    # Ensure that the spreadsheet's structure hasn't generally changed
    _row_headers = page.text.split("\r\n")[0].replace('"', "").split(",")
    assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

    page = open_csv(page)
    for row in page:
        chamber = {"H": "lower", "S": "upper"}[row["office code"]]

        district = row["dist"].lstrip("0")
        assert district.isdigit(), "Invalid district found: {}".format(district)

        name = row["first name"]
        mid = row["middle initial"].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row["last name"]
        suffix = row["suffix"].strip()
        if suffix:
            name += " %s" % suffix

        party = row["party"]
        if party == "Democrat":
            party = "Democratic"

        leg = Person(primary_org=chamber, name=name, district=district, party=party)

        legislator_url = row["URL"].replace("\\", "//").strip()
        if legislator_url != "":
            # BUG FIX: prepend the scheme instead of replacing the whole
            # URL with the bare string "http://".
            if not legislator_url.startswith("http"):
                legislator_url = "http://" + legislator_url
            leg.add_link(legislator_url)

        leg.add_party(party=party)

        office_address = "%s\nRoom %s\nHartford, CT 06106" % (
            row["capitol street address"], row["room number"],
        )
        # extra_office_fields = dict()
        email = row["email"].strip()
        if "@" not in email:
            if not email:
                email = None
            elif email.startswith("http://") or email.startswith("https://"):
                # extra_office_fields['contact_form'] = email
                email = None
            else:
                raise ValueError("Problematic email found: {}".format(email))
        leg.add_contact_detail(
            type="address", value=office_address, note="Capitol Office"
        )
        leg.add_contact_detail(
            type="voice", value=row["capitol phone"], note="Capitol Office"
        )
        if email:
            leg.add_contact_detail(type="email", value=email)

        home_address = "{}\n{}, {} {}".format(
            row["home street address"],
            row["home city"],
            row["home state"],
            row["home zip code"],
        )
        # A "home" address at the LOB is really the capitol office again.
        if "Legislative Office Building" not in home_address:
            leg.add_contact_detail(
                type="address", value=home_address, note="District Office"
            )
            if row["home phone"].strip():
                leg.add_contact_detail(
                    type="voice", value=row["home phone"], note="District Office"
                )
        leg.add_source(leg_url)

        for comm_name in row["committee member1"].split(";"):
            # Entries look like "Committee Name (Role)"; default to member.
            if " (" in comm_name:
                comm_name, role = comm_name.split(" (")
                role = role.strip(")").lower()
            else:
                role = "member"
            comm_name = comm_name.strip()
            if comm_name:
                if comm_name in committees:
                    com = committees[comm_name]
                else:
                    com = Organization(
                        comm_name, classification="committee", chamber=chamber
                    )
                    com.add_source(leg_url)
                    committees[comm_name] = com
                    yield com

                leg.add_membership(name_or_org=com, role=role)

        yield leg
def scrape(self):
    """Scrape NYC Council members and committees.

    Merges Legistar API office records with scraped web-page details
    (normalizing Public Advocate names), then yields committee
    Organizations and Person objects.
    """
    web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage

    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    # Member name -> scraped web-page row.
    web_info = {}
    for member, _ in web_scraper.councilMembers():
        name = member['Person Name']['label'].strip()
        web_info[name] = member

    # One-element unpacking: raises if 'City Council' is not unique.
    city_council, = [body for body in self.bodies()
                     if body['BodyName'] == 'City Council']

    terms = collections.defaultdict(list)

    public_advocates = {  # Match casing to Bill De Blasio as council member
        'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
        'The Public Advocate (Ms. James)': 'Letitia James',
    }

    for office in self.body_offices(city_council):
        name = office['OfficeRecordFullName']
        name = public_advocates.get(name, name).strip()
        terms[name].append(office)

        # Add past members (and advocates public)
        if name not in web_info:
            web_info[name] = collections.defaultdict(lambda: None)

    # Check that we have everyone we expect, formatted consistently, in
    # both information arrays. For instance, this will fail if we forget to
    # strip trailing spaces from names on one side or the other (which has
    # the effect of omitting information, such as post, from the scrape).
    assert set(web_info.keys()) == set(terms.keys())

    members = {}
    for member, offices in terms.items():
        p = Person(member)
        web = web_info[member]

        for term in offices:
            role = term['OfficeRecordTitle']
            if role == 'Public Advocate':
                role = 'Non-Voting Council Member'
            else:
                role = 'Council Member'
            district = web.get('District', '').replace(' 0', ' ')

            p.add_term(role,
                       'legislature',
                       district=district,
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

            party = web.get('Political Party')
            if party == 'Democrat':
                party = 'Democratic'
            if party:
                p.add_party(party)

            if web.get('Photo'):
                p.image = web['Photo']

            # Web field name -> (contact detail type, note).
            contact_types = {
                "City Hall Office": ("address", "City Hall Office"),
                "City Hall Phone": ("voice", "City Hall Phone"),
                "Ward Office Phone": ("voice", "Ward Office Phone"),
                "Ward Office Address": ("address", "Ward Office Address"),
                "Fax": ("fax", "Fax")
            }

            for contact_type, (type_, _note) in contact_types.items():
                # BUG FIX: the second test called the dict —
                # `web(contact_type)` — raising TypeError whenever the
                # field was present; index it instead.
                if web.get(contact_type) and web[contact_type] != 'N/A':
                    p.add_contact_detail(type=type_,
                                         value=web[contact_type],
                                         note=_note)

            if web.get('E-mail'):
                p.add_contact_detail(type="email",
                                     value=web['E-mail']['url'],
                                     note='E-mail')

            if web.get('Web site'):
                p.add_link(web['Web site']['url'], note='web site')

            if web.get('Notes'):
                p.extras = {'Notes': web['Notes']}

            if not p.sources:  # Only add sources once
                source_urls = self.person_sources_from_office(term)
                person_api_url, person_web_url = source_urls
                p.add_source(person_api_url, note='api')
                p.add_source(person_web_url, note='web')

        members[member] = p

    committee_types = ['Committee',
                       'Inactive Committee',
                       'Select Committee',
                       'Subcommittee',
                       'Task Force',
                       'Land Use',  # Committee on Land Use
                       ]

    body_types = {k: v for k, v in self.body_types().items()
                  if k in committee_types}

    for body in self.bodies():
        if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):

            # Skip typo in API data
            if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                continue

            parent_org = PARENT_ORGS.get(body['BodyName'], 'New York City Council')
            body_name = body['BodyName']

            o = Organization(body_name,
                             classification='committee',
                             parent_id={'name': parent_org})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
            o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body), note='web')

            for office in self.body_offices(body):
                # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                # 'Committee Member', None, 'CHAIRPERSON'
                role = office['OfficeRecordTitle']
                if role and role.lower() == 'chairperson':
                    role = 'Chairperson'
                else:
                    role = 'Member'

                person = office['OfficeRecordFullName']
                person = public_advocates.get(person, person).strip()

                if person in members:
                    p = members[person]
                else:
                    # Committee-only people get created (and sourced) here.
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                p.add_membership(o,
                                 role=role,
                                 start_date=self.toDate(office['OfficeRecordStartDate']),
                                 end_date=self.toDate(office['OfficeRecordEndDate']))

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Scrape CT legislators from the state's CSV export.

    Single-quoted duplicate of the CSV-based CT scraper: yields
    committee Organizations (once each, on first sight) and one Person
    per CSV row.

    Raises:
        AssertionError: if the CSV header row or a district value does
            not match expectations.
        ValueError: if an email field is neither empty, an address, nor
            a contact-form URL.
    """
    # chambers = [chamber] if chamber is not None else ['upper', 'lower']
    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    page = self.get(leg_url)

    committees = {}

    # Ensure that the spreadsheet's structure hasn't generally changed
    _row_headers = page.text.split('\r\n')[0].replace('"', '').split(',')
    assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

    page = open_csv(page)
    for row in page:
        chamber = {'H': 'lower', 'S': 'upper'}[row['office code']]

        district = row['dist'].lstrip('0')
        assert district.isdigit(), "Invalid district found: {}".format(district)

        name = row['first name']
        mid = row['middle initial'].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row['last name']
        suffix = row['suffix'].strip()
        if suffix:
            name += " %s" % suffix

        party = row['party']
        if party == 'Democrat':
            party = 'Democratic'

        leg = Person(primary_org=chamber,
                     name=name,
                     district=district,
                     party=party
                     )

        legislator_url = row['URL'].replace('\\', '//').strip()
        if legislator_url != '':
            # BUG FIX: prepend the scheme instead of replacing the whole
            # URL with the bare string 'http://'.
            if not legislator_url.startswith('http'):
                legislator_url = 'http://' + legislator_url
            leg.add_link(legislator_url)

        leg.add_party(party=party)

        office_address = "%s\nRoom %s\nHartford, CT 06106" % (
            row['capitol street address'], row['room number'])
        # extra_office_fields = dict()
        email = row['email'].strip()
        if "@" not in email:
            if not email:
                email = None
            elif email.startswith('http://') or email.startswith('https://'):
                # extra_office_fields['contact_form'] = email
                email = None
            else:
                raise ValueError("Problematic email found: {}".format(email))
        leg.add_contact_detail(type='address',
                               value=office_address,
                               note='Capitol Office')
        leg.add_contact_detail(type='voice',
                               value=row['capitol phone'],
                               note='Capitol Office')
        if email:
            leg.add_contact_detail(type='email', value=email)

        home_address = "{}\n{}, {} {}".format(
            row['home street address'],
            row['home city'],
            row['home state'],
            row['home zip code'],
        )
        # A "home" address at the LOB is really the capitol office again.
        if "Legislative Office Building" not in home_address:
            leg.add_contact_detail(type='address',
                                   value=home_address,
                                   note='District Office')
            if row['home phone'].strip():
                leg.add_contact_detail(type='voice',
                                       value=row['home phone'],
                                       note='District Office')
        leg.add_source(leg_url)

        for comm_name in row['committee member1'].split(';'):
            # Entries look like "Committee Name (Role)"; default to member.
            if ' (' in comm_name:
                comm_name, role = comm_name.split(' (')
                role = role.strip(')').lower()
            else:
                role = 'member'
            comm_name = comm_name.strip()
            if comm_name:
                if comm_name in committees:
                    com = committees[comm_name]
                else:
                    com = Organization(comm_name,
                                       classification='committee',
                                       chamber=chamber)
                    com.add_source(leg_url)
                    committees[comm_name] = com
                    yield com

                leg.add_membership(name_or_org=com, role=role)

        yield leg
def transform_parse(self, parsed_form, response): _source = { "url": response.url, "note": "LDA Form LD-1" } # basic disclosure fields _disclosure = Disclosure( effective_date=datetime.strptime( parsed_form['datetimes']['effective_date'], '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC), timezone='America/New_York', submitted_date=datetime.strptime( parsed_form['datetimes']['signature_date'], '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC), classification="lobbying" ) _disclosure.add_authority(name=self.authority.name, type=self.authority._type, id=self.authority._id) _disclosure.add_identifier( identifier=parsed_form['_meta']['document_id'], scheme="urn:sopr:filing" ) # disclosure extras _disclosure.extras = {} _disclosure.extras['registrant'] = { 'self_employed_individual': parsed_form['registrant']['self_employed_individual'], 'general_description': parsed_form['registrant']['registrant_general_description'], 'signature': { "signature_date": parsed_form['datetimes']['signature_date'], "signature": parsed_form['signature'] } } _disclosure.extras['client'] = { 'same_as_registrant': parsed_form['client']['client_self'], 'general_description': parsed_form['client']['client_general_description'] } _disclosure.extras['registration_type'] = { 'is_amendment': parsed_form['registration_type']['is_amendment'], 'new_registrant': parsed_form['registration_type']['new_registrant'], 'new_client_for_existing_registrant': parsed_form['registration_type'][ 'new_client_for_existing_registrant'], } # # Registrant # build registrant _registrant_self_employment = None if parsed_form['registrant']['self_employed_individual']: n = ' '.join([p for p in [ parsed_form['registrant']['registrant_individual_prefix'], parsed_form['registrant']['registrant_individual_firstname'], parsed_form['registrant']['registrant_individual_lastname'] ] if len(p) > 0]).strip() _registrant = Person( name=n, source_identified=True ) _registrant_self_employment = Organization( name='SELF-EMPLOYMENT of {n}'.format(n=n), 
classification='company', source_identified=True ) _registrant.add_membership( organization=_registrant_self_employment, role='self_employed', label='self-employment of {n}'.format(n=n), start_date=_disclosure.effective_date.strftime('%Y-%m-%d') ) else: _registrant = Organization( name=parsed_form['registrant']['registrant_org_name'], classification='company', source_identified=True ) if len(parsed_form['registrant']['registrant_house_id']) > 0: _registrant.add_identifier( identifier=parsed_form['registrant']['registrant_house_id'], scheme='urn:house_clerk:registrant' ) if len(parsed_form['registrant']['registrant_senate_id']) > 0: _registrant.add_identifier( identifier=parsed_form['registrant']['registrant_senate_id'], scheme='urn:sopr:registrant' ) registrant_contact_details = [ { "type": "address", "note": "contact address", "value": '; '.join([ p for p in [ parsed_form['registrant']['registrant_address_one'], parsed_form['registrant']['registrant_address_two'], parsed_form['registrant']['registrant_city'], parsed_form['registrant']['registrant_state'], parsed_form['registrant']['registrant_zip'], parsed_form['registrant']['registrant_country']] if len(p) > 0]).strip(), }, { "type": "voice", "note": "contact phone", "value": parsed_form['registrant']['registrant_contact_phone'], }, { "type": "email", "note": "contact email", "value": parsed_form['registrant']['registrant_contact_email'], }, ] registrant_contact_ppb = { "type": "address", "note": "principal place of business", "value": '; '.join([ p for p in [ parsed_form['registrant']['registrant_ppb_city'], parsed_form['registrant']['registrant_ppb_state'], parsed_form['registrant']['registrant_ppb_zip'], parsed_form['registrant']['registrant_ppb_country']] if len(p) > 0]).strip(), } if registrant_contact_ppb["value"]: registrant_contact_details.append(registrant_contact_ppb) for cd in registrant_contact_details: _registrant.add_contact_detail(**cd) _registrant.extras = { "contact_details_structured": [ { 
"type": "address", "note": "contact address", "parts": [ { "note": "address_one", "value": parsed_form['registrant'][ 'registrant_address_one'], }, { "note": "address_two", "value": parsed_form['registrant'][ 'registrant_address_two'], }, { "note": "city", "value": parsed_form['registrant'][ 'registrant_city'], }, { "note": "state", "value": parsed_form['registrant'][ 'registrant_state'], }, { "note": "zip", "value": parsed_form['registrant'][ 'registrant_zip'], }, { "note": "country", "value": parsed_form['registrant'][ 'registrant_country'], } ], }, { "type": "address", "note": "principal place of business", "parts": [ { "note": "city", "value": parsed_form['registrant'][ 'registrant_ppb_city'], }, { "note": "state", "value": parsed_form['registrant'][ 'registrant_ppb_state'], }, { "note": "zip", "value": parsed_form['registrant'][ 'registrant_ppb_zip'], }, { "note": "country", "value": parsed_form['registrant'][ 'registrant_ppb_country'], } ], }, ] } # # People # build contact _main_contact = Person( name=parsed_form['registrant']['registrant_contact_name'], source_identified=True ) main_contact_contact_details = [ { "type": "voice", "note": "contact phone", "value": parsed_form['registrant']['registrant_contact_phone'], }, { "type": "email", "note": "contact email", "value": parsed_form['registrant']['registrant_contact_email'], } ] for cd in main_contact_contact_details: _main_contact.add_contact_detail(**cd) if _registrant._type == 'organization': _registrant.add_member( name_or_person=_main_contact, role='main_contact', label='main contact for {n}'.format(n=_registrant.name), start_date=_disclosure.effective_date.strftime('%Y-%m-%d') ) else: _registrant_self_employment.add_member( name_or_person=_main_contact, role='main_contact', label='main contact for {n}'.format(n=_registrant.name), start_date=_disclosure.effective_date.strftime('%Y-%m-%d') ) # # Client # build client _client = Organization( name=parsed_form['client']['client_name'], 
classification='company', source_identified=True ) client_contact_details = [ { "type": "address", "note": "contact address", "value": '; '.join([ p for p in [ parsed_form['client']['client_address'], parsed_form['client']['client_city'], parsed_form['client']['client_state'], parsed_form['client']['client_zip'], parsed_form['client']['client_country']] if len(p) > 0]).strip(), }, ] client_contact_ppb = { "type": "address", "note": "principal place of business", "value": '; '.join([ p for p in [ parsed_form['client']['client_ppb_city'], parsed_form['client']['client_ppb_state'], parsed_form['client']['client_ppb_zip'], parsed_form['client']['client_ppb_country']] if len(p) > 0]).strip(), } if client_contact_ppb["value"]: client_contact_details.append(client_contact_ppb) for cd in client_contact_details: _client.add_contact_detail(**cd) _client.extras = { "contact_details_structured": [ { "type": "address", "note": "contact address", "parts": [ { "note": "address", "value": parsed_form['client']['client_address'], }, { "note": "city", "value": parsed_form['client']['client_city'], }, { "note": "state", "value": parsed_form['client']['client_state'], }, { "note": "zip", "value": parsed_form['client']['client_zip'], }, { "note": "country", "value": parsed_form['client']['client_country'], } ], }, { "type": "address", "note": "principal place of business", "parts": [ { "note": "city", "value": parsed_form['client']['client_ppb_city'], }, { "note": "state", "value": parsed_form['client']['client_ppb_state'], }, { "note": "zip", "value": parsed_form['client']['client_ppb_zip'], }, { "note": "country", "value": parsed_form['client'][ 'client_ppb_country'], } ], }, ], } # Collect Foreign Entities _foreign_entities = [] _foreign_entities_by_name = {} for fe in parsed_form['foreign_entities']: fe_extras = {} fe_name = fe['foreign_entity_name'] # check for name-based duplicates if fe_name in _foreign_entities_by_name: _foreign_entity = _foreign_entities_by_name[fe_name] else: 
_foreign_entity = Organization( name=fe_name, classification='company', source_identified=True ) # collect contact details foreign_entity_contact_details = [ { "type": "address", "note": "contact address", "value": '; '.join([ p for p in [ fe['foreign_entity_address'], fe['foreign_entity_city'], fe['foreign_entity_state'], fe['foreign_entity_country']] if len(p) > 0]).strip(), }, { "type": "address", "note": "principal place of business", "value": '; '.join([ p for p in [ fe['foreign_entity_ppb_state'], fe['foreign_entity_ppb_country']] if len(p) > 0]).strip(), }, ] foreign_entity_contact_ppb = { "type": "address", "note": "principal place of business", "value": '; '.join([ p for p in [ fe['foreign_entity_ppb_city'], fe['foreign_entity_ppb_state'], fe['foreign_entity_ppb_country']] if len(p) > 0]), } if foreign_entity_contact_ppb["value"]: foreign_entity_contact_details.append( foreign_entity_contact_ppb) # add contact details for cd in foreign_entity_contact_details: if cd['value'] != '': _foreign_entity.add_contact_detail(**cd) # add extras fe_extras["contact_details_structured"] = [ { "type": "address", "note": "contact address", "parts": [ { "note": "address", "value": fe['foreign_entity_address'], }, { "note": "city", "value": fe['foreign_entity_city'], }, { "note": "state", "value": fe['foreign_entity_state'], }, { "note": "country", "value": fe['foreign_entity_country'], } ], }, { "type": "address", "note": "principal place of business", "parts": [ { "note": "state", "value": fe['foreign_entity_ppb_state'], }, { "note": "country", "value": fe['foreign_entity_ppb_country'], } ], }, ] _foreign_entity.extras = combine_dicts(_foreign_entity.extras, fe_extras) _foreign_entities_by_name[fe_name] = _foreign_entity for unique_foreign_entity in _foreign_entities_by_name.values(): _foreign_entities.append(unique_foreign_entity) # TODO: add a variant on memberships to represent inter-org # relationships (associations, ownership, etc) # # _client['memberships'].append({ 
# "id": _foreign_entity['id'], # "classification": "organization", # "name": _foreign_entity['name'], # "extras": { # "ownership_percentage": # fe['foreign_entity_amount'] # } # }) # Collect Lobbyists # TODO: deal with wierd non-name line continuation cases (blanks, "continued") _lobbyists_by_name = {} for l in parsed_form['lobbyists']: l_extras = {} l_name = ' '.join([l['lobbyist_first_name'], l['lobbyist_last_name'], l['lobbyist_suffix'] ]).strip() if l_name in _lobbyists_by_name: _lobbyist = _lobbyists_by_name[l_name] else: _lobbyist = Person( name=l_name, source_identified=True ) if l['lobbyist_covered_official_position']: l_extras['lda_covered_official_positions'] = [ { 'date_reported': parsed_form['datetimes']['effective_date'], 'covered_official_position': l['lobbyist_covered_official_position'] }, ] _lobbyist.extras = combine_dicts(_lobbyist.extras, l_extras) _lobbyists_by_name[l_name] = _lobbyist _lobbyists = [] for unique_lobbyist in _lobbyists_by_name.values(): _lobbyists.append(unique_lobbyist) if _registrant._type == 'organization': for l in _lobbyists: _registrant.add_member( l, role='lobbyist', label='lobbyist for {n}'.format(n=_registrant.name), start_date=_disclosure.effective_date.strftime('%Y-%m-%d') ) else: for l in _lobbyists: _registrant_self_employment.add_member( l, role='lobbyist', label='lobbyist for {n}'.format(n=_registrant.name), start_date=_disclosure.effective_date.strftime('%Y-%m-%d') ) # # Document # build document _disclosure.add_document( note='submitted filing', date=parsed_form['datetimes']['effective_date'][:10], url=response.url ) # Collect Affiliated orgs _affiliated_organizations = [] _affiliated_organizations_by_name = {} for ao in parsed_form['affiliated_organizations']: ao_extras = {} ao_name = ao['affiliated_organization_name'] if ao_name in _affiliated_organizations_by_name: # There's already one by this name _affiliated_organization = _affiliated_organizations_by_name[ao_name] else: # New affiliated org 
_affiliated_organization = Organization( name=ao_name, classification='company', source_identified=True ) # collect contact details affiliated_organization_contact_details = [ { "type": "address", "note": "contact address", "value": '; '.join([ p for p in [ ao['affiliated_organization_address'], ao['affiliated_organization_city'], ao['affiliated_organization_state'], ao['affiliated_organization_zip'], ao['affiliated_organization_country']] if len(p) > 0]).strip(), }, ] affiliated_organization_contact_ppb = { "type": "address", "note": "principal place of business", "value": '; '.join([ p for p in [ ao['affiliated_organization_ppb_city'], ao['affiliated_organization_ppb_state'], ao['affiliated_organization_ppb_country']] if len(p) > 0]).strip(), } if affiliated_organization_contact_ppb["value"]: affiliated_organization_contact_details.append( affiliated_organization_contact_ppb) # add contact details for cd in affiliated_organization_contact_details: _affiliated_organization.add_contact_detail(**cd) ao_extras["contact_details_structured"] = [ { "type": "address", "note": "contact address", "parts": [ { "note": "address", "value": ao['affiliated_organization_address'], }, { "note": "city", "value": ao['affiliated_organization_city'], }, { "note": "state", "value": ao['affiliated_organization_state'], }, { "note": "zip", "value": ao['affiliated_organization_zip'], }, { "note": "country", "value": ao['affiliated_organization_country'], } ], }, { "type": "address", "note": "principal place of business", "parts": [ { "note": "city", "value": ao['affiliated_organization_ppb_city'], }, { "note": "state", "value": ao['affiliated_organization_ppb_state'], }, { "note": "country", "value": ao['affiliated_organization_ppb_country'], } ], }, ], _affiliated_organization.extras = combine_dicts( _affiliated_organization.extras, ao_extras) for unique_affiliated_organization in _affiliated_organizations_by_name.values(): 
_affiliated_organizations.append(unique_affiliated_organization) # # Events & Agendas # name if parsed_form['registration_type']['new_registrant']: registration_type = 'New Client, New Registrant' elif parsed_form['registration_type']['is_amendment']: registration_type = 'Amended Registration' else: registration_type = 'New Client for Existing Registrant' # Create registration event _event = Event( name="{rn} - {rt}, {cn}".format(rn=_registrant.name, rt=registration_type, cn=_client.name), timezone='America/New_York', location='United States', start_time=datetime.strptime( parsed_form['datetimes']['effective_date'], '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC), classification='registration' ) # add participants _event.add_participant(type=_registrant._type, id=_registrant._id, name=_registrant.name, note="registrant") if _registrant._type == 'person': _event.add_participant(type=_registrant._type, id=_registrant._id, name=_registrant.name, note="registrant") _event.add_participant(type=_client._type, id=_client._id, name=_client.name, note="client") for l in _lobbyists: _event.add_participant(type=l._type, id=l._id, name=l.name, note='lobbyist') for fe in _foreign_entities: _event.add_participant(type=fe._type, id=fe._id, name=fe.name, note='foreign_entity') for ao in _affiliated_organizations: _event.add_participant(type=ao._type, id=ao._id, name=ao.name, note='affiliated_organization') # add agenda item _agenda = _event.add_agenda_item( description='issues lobbied on', ) _agenda['notes'].append( parsed_form['lobbying_issues_detail'] ) for li in parsed_form['lobbying_issues']: if li['general_issue_area'] != '': _agenda.add_subject(li['general_issue_area']) _disclosure.add_disclosed_event( name=_event.name, type=_event._type, classification=_event.classification, id=_event._id ) # add registrant to disclosure's _related and related_entities fields _disclosure.add_registrant(name=_registrant.name, type=_registrant._type, id=_registrant._id) _registrant.add_source( 
url=_source['url'], note='registrant' ) yield _registrant if _registrant_self_employment is not None: _registrant_self_employment.add_source( url=_source['url'], note='registrant_self_employment' ) yield _registrant_self_employment _client.add_source( url=_source['url'], note='client' ) yield _client _main_contact.add_source( url=_source['url'], note='main_contact' ) yield _main_contact for ao in _affiliated_organizations: ao.add_source( url=_source['url'], note='affiliated_organization' ) yield ao for fe in _foreign_entities: fe.add_source( url=_source['url'], note='foreign_entity' ) yield fe for l in _lobbyists: l.add_source( url=_source['url'], note='lobbyist' ) yield l _event.add_source(**_source) yield _event _disclosure.add_source(**_source) yield _disclosure
def scrape_chamber(self, chamber):
    """Yield a Person for every member of one Arizona chamber.

    ``chamber`` is 'lower' or 'upper'; it picks the House/Senate roster
    page on azleg.gov and the capitol office address line.
    """
    body = {'lower': 'H', 'upper': 'S'}[chamber]
    url = 'http://www.azleg.gov/MemberRoster/?body=' + body
    page = self.get(url).text

    # there is a bad comment closing tag on this page
    page = page.replace('--!>', '-->')
    root = html.fromstring(page)

    path = '//table//tr'
    # First row is the table header; skip it.
    roster = root.xpath(path)[1:]
    for row in roster:
        position = ''
        name, district, party, email, room, phone, = row.xpath('td')

        if email.attrib.get('class') == 'vacantmember':
            continue  # Skip any vacant members.

        link = name.xpath('string(a/@href)')
        if len(name) == 1:
            name = name.text_content().strip()
        else:
            # Extra child elements mean a leadership position is listed
            # after the name link (e.g. "Speaker").
            position = name.tail.strip()
            name = name[0].text_content().strip()
        if '--' in name:
            name = name.split('--')[0].strip()

        # Fetch the member detail page just for the photo.
        linkpage = self.get(link).text
        linkpage = linkpage.replace('--!>', '-->')
        linkroot = html.fromstring(linkpage)
        linkroot.make_links_absolute(link)

        photos = linkroot.xpath("//img[contains(@src, 'MemberPhoto')]")
        if len(photos) != 1:
            self.warning('no photo on ' + link)
            photo_url = ''
        else:
            photo_url = photos[0].attrib['src']

        district = district.text_content().strip()
        party = party.text_content().strip()

        email = email.text_content().strip()
        if email.startswith('Email: '):
            # The page only shows the local part; append the domain.
            email = email.replace('Email: ', '').lower() + '@azleg.gov'
        else:
            email = ''

        party = self.get_party(party)
        room = room.text_content().strip()
        if chamber == 'lower':
            address = "House of Representatives\n"
        else:
            address = "Senate\n"
        address = address + "1700 West Washington\n Room " + room \
            + "\nPhoenix, AZ 85007"

        phone = phone.text_content().strip()
        # Phones listed without an area code get Phoenix's 602 prefix.
        if '602' not in re.findall(r'(\d+)', phone):
            phone = "602-" + phone

        leg = Person(primary_org=chamber, image=photo_url, name=name,
                     district=district, party=party)

        leg.add_contact_detail(type='address', value=address,
                               note='Capitol Office')
        leg.add_contact_detail(type='voice', value=phone,
                               note='Capitol Office')
        leg.add_party(party=party)
        leg.add_link(link)

        if email:
            leg.add_contact_detail(type='email', value=email)
        if position:
            # NOTE(review): the membership org is the *party* string here,
            # not the chamber — looks suspicious, confirm intent.
            leg.add_membership(name_or_org=party, role=position)
            # leg.add_role(position, term, chamber=chamber,
            #              district=district, party=party)

        leg.add_source(url)

        # Probably just get this from the committee scraper
        # self.scrape_member_page(link, session, chamber, leg)
        yield leg
def scrape(self):
    """Yield Person objects for MN legislators via the openstates GraphQL API.

    Queries two `memberOf` organizations (presumably House and Senate —
    TODO confirm the OCD ids), then issues per-name follow-up queries for
    any representative in the module-level ``rep_names`` list that the
    first page of 100 results did not cover.
    """
    url = 'http://alpha.openstates.org/graphql'
    scrapers = [
        { 'query': '{ people(memberOf:"ocd-organization/e91db6f8-2232-49cd-91af-fdb5adb4ac3b", first: 100) { edges { node { name party: currentMemberships(classification:"party") { organization { name }} links { url } sources { url } chamber: currentMemberships(classification:["upper", "lower"]) { post { label } organization { name classification parent { name }}}}}}}' },
        # { 'query': '{ people(memberOf:"ocd-organization/e91db6f8-2232-49cd-91af-fdb5adb4ac3b", last: 100) { edges { node { name party: currentMemberships(classification:"party") { organization { name }} links { url } sources { url } chamber: currentMemberships(classification:["upper", "lower"]) { post { label } organization { name classification parent { name }}}}}}}'},
        { 'query': '{ people(memberOf:"ocd-organization/6a026144-758d-4d57-b856-9c60dce3c4b5", first: 100) { edges { node { name party: currentMemberships(classification:"party") { organization { name }} links { url } sources { url } chamber: currentMemberships(classification:["upper", "lower"]) { post { label } organization { name classification parent { name }}}}}}}' },
    ]
    # Run the first (House) query once just to find which names in
    # rep_names were already returned.
    base = requests.get(url=url, json=scrapers[0])
    base = base.json()
    ppl = base['data']['people']['edges']
    for p in ppl:
        p = p['node']
        if p['name'] in rep_names:
            rep_names.remove(p['name'])
    # Get names unretrieved from primary House API Query
    print('REP NAMES: ', rep_names)
    # NOTE(review): hard-coded exclusion; presumably this name causes a
    # bad query or duplicate — confirm why it is removed.
    rep_names.remove('Gene Pelowski')
    for rep in rep_names:
        # Per-name query for each representative that was missed above.
        query = '{ people(memberOf:"ocd-organization/e91db6f8-2232-49cd-91af-fdb5adb4ac3b", first: 100, name: "' + rep + '") { edges { node { name party: currentMemberships(classification:"party") { organization { name }} links { url } sources { url } chamber: currentMemberships(classification:["upper", "lower"]) { post { label } organization { name classification parent { name }}}}}}}'
        query = {'query': query}
        scrapers.append(query)
    for s in scrapers:
        base = requests.get(url=url, json=s)
        base = base.json()
        print(base)
        ppl = base['data']['people']['edges']
        for p in ppl:
            p = p['node']
            orgs = p['chamber']
            rep = Person(name=p['name'], role='State Representative')
            for o in orgs:
                ppr(o)  # debug dump of the membership node
                name = o['organization']['name']
                classification = o['organization']['classification']
                if o['organization']['parent']:
                    pname = o['organization']['parent']['name']
                    if pname == 'Minnesota Legislature':
                        label = o['post']['label']
                        if 'House' in name:
                            role = 'State Representative'
                        elif 'Senate' in name:
                            role = 'State Senator'
                        # NOTE(review): if the org name matches neither
                        # branch, `role` keeps its previous value — verify
                        # that case cannot occur.
                        rep.add_term(role, classification,
                                     district=label, org_name=name)
                        rep.add_source(p['sources'][0]['url'])
                else:
                    # Org with no parent: record a plain membership.
                    rep.add_membership(name)
                    rep.add_source(p['sources'][0]['url'])
            yield rep
def scrape(self):
    """Yield CT legislators (and their committees) from the CGA CSV export.

    Downloads the legislator spreadsheet over FTP, validates that its
    column layout still matches the module-level ``HEADERS``, then yields
    one Organization per committee (first time it is seen) and one Person
    per row.

    Raises:
        AssertionError: if the spreadsheet structure changed or a district
            value is non-numeric.
        ValueError: for an email field that is neither empty, a URL, nor
            an address containing ``@``.
    """
    # chambers = [chamber] if chamber is not None else ['upper', 'lower']
    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    page = self.get(leg_url)
    committees = {}

    # Ensure that the spreadsheet's structure hasn't generally changed
    _row_headers = page.text.split('\r\n')[0].replace('"', '').split(',')
    assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

    page = open_csv(page)
    for row in page:
        chamber = {'H': 'lower', 'S': 'upper'}[row['office code']]

        district = row['dist'].lstrip('0')
        assert district.isdigit(), "Invalid district found: {}".format(
            district)

        # Assemble "First [M] Last [Suffix]" from the separate columns.
        name = row['first name']
        mid = row['middle initial'].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row['last name']
        suffix = row['suffix'].strip()
        if suffix:
            name += " %s" % suffix

        party = row['party']
        if party == 'Democrat':
            party = 'Democratic'

        leg = Person(primary_org=chamber,
                     name=name,
                     district=district,
                     party=party)

        legislator_url = row['URL'].replace('\\', '//').strip()
        if legislator_url != '':
            if not legislator_url.startswith('http'):
                # BUGFIX: previously this assigned the bare scheme
                # ('http://') and threw the actual URL away; prepend the
                # scheme to the URL instead.
                legislator_url = 'http://' + legislator_url
            leg.add_link(legislator_url)

        leg.add_party(party=party)

        office_address = "%s\nRoom %s\nHartford, CT 06106" % (
            row['capitol street address'], row['room number'])
        # extra_office_fields = dict()

        email = row['email'].strip()
        if "@" not in email:
            if not email:
                email = None
            elif email.startswith('http://') or email.startswith(
                    'https://'):
                # A contact-form URL, not a mailbox; drop it.
                # extra_office_fields['contact_form'] = email
                email = None
            else:
                raise ValueError(
                    "Problematic email found: {}".format(email))

        leg.add_contact_detail(type='address', value=office_address,
                               note='Capitol Office')
        leg.add_contact_detail(type='voice', value=row['capitol phone'],
                               note='Capitol Office')
        if email:
            leg.add_contact_detail(type='email', value=email)

        home_address = "{}\n{}, {} {}".format(
            row['home street address'],
            row['home city'],
            row['home state'],
            row['home zip code'],
        )
        # Members who list the LOB as "home" have no real district office.
        if "Legislative Office Building" not in home_address:
            leg.add_contact_detail(type='address', value=home_address,
                                   note='District Office')
            if row['home phone'].strip():
                leg.add_contact_detail(type='voice',
                                       value=row['home phone'],
                                       note='District Office')

        leg.add_source(leg_url)

        # Committee cells look like "Name (Role);Name;..."; default role
        # is plain membership.
        for comm_name in row['committee member1'].split(';'):
            if ' (' in comm_name:
                comm_name, role = comm_name.split(' (')
                role = role.strip(')').lower()
            else:
                role = 'member'
            comm_name = comm_name.strip()
            if comm_name:
                if comm_name in committees:
                    com = committees[comm_name]
                else:
                    com = Organization(comm_name,
                                       classification='committee',
                                       chamber=chamber)
                    com.add_source(leg_url)
                    committees[comm_name] = com
                    yield com
                leg.add_membership(name_or_org=com, role=role)

        yield leg
def scrape_legislator_page(self, term, url):
    """Yield one Person scraped from a single legislator detail page.

    ``term`` is accepted but unused in this body; ``url`` is the detail
    page to fetch. Raises KeyError if the page lists a labeled field not
    present in ``item_mapping``, or a chamber/party outside the two maps.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    name = page.xpath("//h1[@id='page-title']/text()")[0]
    # Strip the honorific prefix from the page title.
    name = re.sub(r'^(Representative|Senator)\s', '', name)

    district = page.xpath("//a[contains(@href, 'district')]/text()")[0]
    district = district.replace("District", "").strip()

    committees = page.xpath("//a[contains(@href, 'committees')]/text()")

    photo = page.xpath(
        "//div[@class='field-person-photo']/img/@src"
    )
    photo = photo[0] if len(photo) else None

    address = page.xpath("//div[@class='adr']")
    if address:
        address = address[0]
        # Collapse runs of spaces/tabs inside the address block.
        address = re.sub("[ \t]+", " ", address.text_content()).strip()
    else:
        address = None

    # Map the page's human-readable field labels onto stable keys.
    item_mapping = {
        "email": "email",
        "home telephone": "home-telephone",
        "cellphone": "cellphone",
        "office telephone": "office-telephone",
        "political party": "party",
        "chamber": "chamber",
        "fax": "fax"
    }
    metainf = {}

    for block in page.xpath("//div[contains(@class, 'field-label-inline')]"):
        label, items = block.xpath("./*")
        key = label.text_content().strip().lower()
        if key.endswith(":"):
            key = key[:-1]
        metainf[item_mapping[key]] = items.text_content().strip()

    chamber = {
        "Senate": "upper",
        "House": "lower"
    }[metainf['chamber']]

    party = {"Democrat": "Democratic",
             "Republican": "Republican"}[metainf['party']]

    person = Person(primary_org=chamber,
                    district=district,
                    name=name,
                    party=party,
                    image=photo)
    person.add_link(url)

    # Capitol office contact details, only when present and non-blank.
    for key, person_key in [('email', 'email'),
                            ('fax', 'fax'),
                            ('office-telephone', 'voice')]:
        if key in metainf:
            if metainf[key].strip():
                person.add_contact_detail(type=person_key,
                                          value=metainf[key],
                                          note="Capitol Office")

    if address:
        person.add_contact_detail(type='address', value=address,
                                  note="District Office")

    if 'cellphone' in metainf:
        person.add_contact_detail(type='voice',
                                  value=metainf['cellphone'],
                                  note="District Office")

    if 'home-telephone' in metainf:
        person.add_contact_detail(type='voice',
                                  value=metainf['home-telephone'],
                                  note="District Office")

    # Committee links become memberships created by name (resolved later).
    for committee in committees:
        person.add_membership(name_or_org=committee,
                              role='committee member')

    person.add_source(url)
    yield person
def scrape(self):
    '''
    Scrape the web to create a dict with all active organizations.
    Then, we can access the correct URL for the organization detail page.
    '''
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'https://metro.legistar.com/People.aspx'

    # Index web-page org info by department name for URL lookups below.
    web_info = {}
    for _, organizations in web_scraper.councilMembers():
        for organization, _, _ in organizations:
            organization_name = organization['Department Name'][
                'label'].strip()
            organization_info = organization['Department Name']
            web_info[organization_name] = organization_info

    body_types = self.body_types()

    # Exactly one body is expected to match; the comma unpacking asserts it.
    board_of_directors, = [
        body
        for body in self.bodies()
        if body['BodyName'] == 'Board of Directors - Regular Board Meeting'
    ]
    board_of_directors["BodyName"] = "Board of Directors"

    # Group every board office record by the member's full name.
    terms = collections.defaultdict(list)
    for office in self.body_offices(board_of_directors):
        terms[office['OfficeRecordFullName']].append(office)

    members = {}
    for member, offices in terms.items():
        p = Person(member)

        for term in offices:
            role = term['OfficeRecordTitle']

            # Officer roles (chair, etc.) are recorded as appointed terms.
            if role not in {'Board Member', 'non-voting member'}:
                p.add_term(
                    role,
                    'legislature',
                    start_date=self.toDate(term['OfficeRecordStartDate']),
                    end_date=self.toDate(term['OfficeRecordEndDate']),
                    appointment=True)

            # Everyone except the CEO also gets a seat-holding term.
            if role != 'Chief Executive Officer':
                if role == 'non-voting member':
                    member_type = 'Nonvoting Board Member'
                    post = NONVOTING_POSTS.get(member)
                else:
                    member_type = 'Board Member'
                    post = VOTING_POSTS.get(member)

                start_date = self.toDate(term['OfficeRecordStartDate'])
                end_date = self.toDate(term['OfficeRecordEndDate'])
                board_membership = p.add_term(member_type,
                                              'legislature',
                                              district=post,
                                              start_date=start_date,
                                              end_date=end_date)

                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                    p.name)
                if acting_member_end_date and acting_member_end_date <= end_date:
                    board_membership.extras = {'acting': 'true'}

            # Each term contains first and last names. This should be the
            # same across all of a person's terms, so go ahead and grab
            # them from the last term in the array.
            p.family_name = term['OfficeRecordLastName']
            p.given_name = term['OfficeRecordFirstName']

            # Defensively assert that the given and family names match the
            # expected value.
            if member == 'Hilda L. Solis':
                # Given/family name does not contain middle initial.
                assert p.given_name == 'Hilda' and p.family_name == 'Solis'
            else:
                assert member == ' '.join([p.given_name, p.family_name])

            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

        members[member] = p

    # Committees: one Organization per committee body, with memberships.
    for body in self.bodies():
        if body['BodyTypeId'] in (
                body_types['Committee'],
                body_types['Independent Taxpayer Oversight Committee']):
            organization_name = body['BodyName'].strip()

            o = Organization(organization_name,
                             classification='committee',
                             parent_id={'name': 'Board of Directors'})

            organization_info = web_info.get(organization_name, {})
            # NOTE(review): the default concatenates WEB_URL with an
            # absolute URL, which looks malformed — confirm intent.
            organization_url = organization_info.get(
                'url',
                self.WEB_URL + 'https://metro.legistar.com/Departments.aspx')

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(organization_url, note='web')

            for office in self.body_offices(body):
                role = office['OfficeRecordTitle']
                if role not in BOARD_OFFICE_ROLES:
                    if role == 'non-voting member':
                        role = 'Nonvoting Member'
                    else:
                        role = 'Member'

                person = office['OfficeRecordFullName']
                # Reuse a board member's Person if we already built one.
                if person in members:
                    p = members[person]
                else:
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                start_date = self.toDate(office['OfficeRecordStartDate'])
                end_date = self.toDate(office['OfficeRecordEndDate'])
                membership = p.add_membership(organization_name,
                                              role=role,
                                              start_date=start_date,
                                              end_date=end_date)

                acting_member_end_date = ACTING_MEMBERS_WITH_END_DATE.get(
                    p.name)
                if acting_member_end_date and acting_member_end_date <= end_date:
                    membership.extras = {'acting': 'true'}

            yield o

    # People are yielded last so their memberships are complete.
    for p in members.values():
        yield p
def scrape_chamber(self, chamber):
    """Yield Organizations (committees) and Persons for one SC chamber.

    ``chamber`` is 'lower' or 'upper'; each committee is yielded once,
    before any Person holding a membership in it.
    """
    if chamber == "lower":
        url = "http://www.scstatehouse.gov/member.php?chamber=H"
    else:
        url = "http://www.scstatehouse.gov/member.php?chamber=S"

    seen_committees = {}

    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    for a in doc.xpath('//a[@class="membername"]'):
        full_name = a.text
        leg_url = a.get("href")

        # Strip the honorific prefix from the roster link text.
        if full_name.startswith("Senator"):
            full_name = full_name.replace("Senator ", "")
        if full_name.startswith("Representative"):
            full_name = full_name.replace("Representative ", "")

        leg_html = self.get(leg_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)

        if "Resigned effective" in leg_html:
            self.info("Resigned")
            continue

        party, district, _ = leg_doc.xpath(
            '//p[@style="font-size: 17px;'
            ' margin: 0 0 0 0; padding: 0;"]/text()')
        if "Republican" in party:
            party = "Republican"
        elif "Democrat" in party:
            party = "Democratic"

        # District # - County - Map
        district = district.split()[1]

        try:
            photo_url = leg_doc.xpath(
                '//img[contains(@src,"/members/")]/@src')[0]
        except IndexError:
            self.warning("No Photo URL for {}".format(full_name))
            photo_url = ""

        person = Person(
            name=full_name,
            district=district,
            party=party,
            primary_org=chamber,
            image=photo_url,
        )

        # office address / phone
        try:
            addr_div = leg_doc.xpath(
                '//div[@style="float: left; width: 225px;'
                ' margin: 10px 5px 0 20px; padding: 0;"]')[0]
            capitol_address = addr_div.xpath(
                'p[@style="font-size: 13px;'
                ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

            phone = addr_div.xpath(
                'p[@style="font-size: 13px;'
                ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
            capitol_phone = phone.strip()

            if capitol_address:
                person.add_contact_detail(type="address",
                                          value=capitol_address,
                                          note="Capitol Office")

            if capitol_phone:
                person.add_contact_detail(type="voice",
                                          value=capitol_phone,
                                          note="Capitol Office")
        except IndexError:
            # Page layout is style-attribute-matched; absence just means
            # no capitol office block.
            self.warning("no capitol address for {0}".format(full_name))

        # home address / phone
        try:
            addr_div = leg_doc.xpath(
                '//div[@style="float: left;'
                ' width: 225px; margin: 10px 0 0 20px;"]')[0]
            addr = addr_div.xpath(
                'p[@style="font-size: 13px;'
                ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

            phone = addr_div.xpath(
                'p[@style="font-size: 13px;'
                ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
            phone = phone.strip()
            if addr:
                person.add_contact_detail(type="address",
                                          value=addr,
                                          note="District Office")

            if phone:
                person.add_contact_detail(type="voice",
                                          value=phone,
                                          note="District Office")
        except IndexError:
            self.warning("no district address for {0}".format(full_name))

        person.add_link(leg_url)
        person.add_source(url)
        person.add_source(leg_url)

        # committees (skip first link)
        for com in leg_doc.xpath(
                '//a[contains(@href, "committee.php")]')[1:]:
            if com.text.endswith(", "):
                committee, role = com.text_content().rsplit(", ", 1)

                # known roles
                role = {
                    "Treas.": "treasurer",
                    "Secy.": "secretary",
                    "Secy./Treas.": "secretary/treasurer",
                    "V.C.": "vice-chair",
                    "1st V.C.": "first vice-chair",
                    "Co 1st V.C.": "co-first vice-chair",
                    "2nd V.C.": "second vice-chair",
                    "3rd V.C.": "third vice-chair",
                    "Ex.Officio Member": "ex-officio member",
                    "Chairman": "chairman",
                }[role]
            else:
                committee = com.text
                role = "member"

            # only yield each committee once
            if committee not in seen_committees:
                com = Organization(name=committee,
                                   classification="committee",
                                   chamber=chamber)
                com.add_source(url)
                seen_committees[committee] = com
                yield com
            else:
                com = seen_committees[committee]

            person.add_membership(com, role=role)

        yield person
def scrape_chamber(self, chamber):
    """Yield a Person for every member of one Arizona chamber.

    ``chamber`` is 'lower' or 'upper'; it selects the House/Senate roster
    page on azleg.gov and the capitol office address line.
    """
    body = {"lower": "H", "upper": "S"}[chamber]
    url = "http://www.azleg.gov/MemberRoster/?body=" + body
    page = self.get(url).text

    # there is a bad comment closing tag on this page
    page = page.replace("--!>", "-->")
    root = html.fromstring(page)

    path = "//table//tr"
    # First row is the table header; skip it.
    roster = root.xpath(path)[1:]
    for row in roster:
        position = ""
        name, district, party, email, room, phone, = row.xpath("td")

        if email.attrib.get("class") == "vacantmember":
            continue  # Skip any vacant members.

        link = name.xpath("string(a/@href)")
        if len(name) == 1:
            name = name.text_content().strip()
        else:
            # Extra child elements mean a leadership position is listed
            # after the name link (e.g. "Speaker").
            position = name.tail.strip()
            name = name[0].text_content().strip()
        if "--" in name:
            name = name.split("--")[0].strip()

        # Fetch the member detail page just for the photo.
        linkpage = self.get(link).text
        linkpage = linkpage.replace("--!>", "-->")
        linkroot = html.fromstring(linkpage)
        linkroot.make_links_absolute(link)

        photos = linkroot.xpath("//img[contains(@src, 'MemberPhoto')]")
        if len(photos) != 1:
            self.warning("no photo on " + link)
            photo_url = ""
        else:
            photo_url = photos[0].attrib["src"]

        district = district.text_content().strip()
        party = party.text_content().strip()

        email = email.text_content().strip()
        if email.startswith("Email: "):
            # The page only shows the local part; append the domain.
            email = email.replace("Email: ", "").lower() + "@azleg.gov"
        else:
            email = ""

        party = self.get_party(party)
        room = room.text_content().strip()
        if chamber == "lower":
            address = "House of Representatives\n"
        else:
            address = "Senate\n"
        address = (address + "1700 West Washington\n Room " + room
                   + "\nPhoenix, AZ 85007")

        phone = phone.text_content().strip()
        # Phones listed without an area code get Phoenix's 602 prefix.
        if "602" not in re.findall(r"(\d+)", phone):
            phone = "602-" + phone

        leg = Person(
            primary_org=chamber,
            image=photo_url,
            name=name,
            district=district,
            party=party,
        )

        leg.add_contact_detail(type="address", value=address,
                               note="Capitol Office")
        leg.add_contact_detail(type="voice", value=phone,
                               note="Capitol Office")
        leg.add_party(party=party)
        leg.add_link(link)

        if email:
            leg.add_contact_detail(type="email", value=email)
        if position:
            # NOTE(review): the membership org is the *party* string here,
            # not the chamber — looks suspicious, confirm intent.
            leg.add_membership(name_or_org=party, role=position)
            # leg.add_role(position, term, chamber=chamber,
            #              district=district, party=party)

        leg.add_source(url)

        # Probably just get this from the committee scraper
        # self.scrape_member_page(link, session, chamber, leg)
        yield leg
def scrape_chamber(self, chamber):
    """Yield Organizations (committees) and Persons for one SC chamber.

    ``chamber`` is 'lower' or 'upper'; each committee is yielded once,
    before any Person holding a membership in it.
    """
    if chamber == 'lower':
        url = 'http://www.scstatehouse.gov/member.php?chamber=H'
    else:
        url = 'http://www.scstatehouse.gov/member.php?chamber=S'

    seen_committees = {}

    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(url)

    # Member detail pages are linked with a "code=" query parameter.
    for a in doc.xpath('//a[contains(@href, "code=")]'):
        full_name = a.text
        leg_url = a.get('href')

        leg_html = self.get(leg_url).text
        leg_doc = lxml.html.fromstring(leg_html)
        leg_doc.make_links_absolute(leg_url)

        if 'Resigned effective' in leg_html:
            self.info('Resigned')
            continue

        party, district, _ = leg_doc.xpath('//p[@style="font-size: 17px;'
                                           ' margin: 0 0 0 0; padding: 0;"]/text()')
        if 'Republican' in party:
            party = 'Republican'
        elif 'Democrat' in party:
            party = 'Democratic'

        # District # - County - Map
        district = district.split()[1]

        photo_url = leg_doc.xpath('//img[contains(@src,"/members/")]/@src')[0]

        person = Person(name=full_name,
                        district=district,
                        party=party,
                        primary_org=chamber,
                        image=photo_url)

        # office address / phone
        try:
            addr_div = leg_doc.xpath('//div[@style="float: left; width: 225px;'
                                     ' margin: 10px 5px 0 20px; padding: 0;"]')[0]
            capitol_address = addr_div.xpath('p[@style="font-size: 13px;'
                                             ' margin: 0 0 10px 0; padding: 0;"]'
                                             )[0].text_content()

            phone = addr_div.xpath('p[@style="font-size: 13px;'
                                   ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
            capitol_phone = phone.strip()

            if capitol_address:
                person.add_contact_detail(type='address',
                                          value=capitol_address,
                                          note='Capitol Office')

            if capitol_phone:
                person.add_contact_detail(type='voice',
                                          value=capitol_phone,
                                          note='Capitol Office')
        except IndexError:
            # Layout is style-attribute-matched; absence just means no
            # capitol office block on the page.
            self.warning('no capitol address for {0}'.format(full_name))

        # home address / phone
        try:
            addr_div = leg_doc.xpath('//div[@style="float: left;'
                                     ' width: 225px; margin: 10px 0 0 20px;"]')[0]
            addr = addr_div.xpath('p[@style="font-size: 13px;'
                                  ' margin: 0 0 10px 0; padding: 0;"]')[0].text_content()

            phone = addr_div.xpath('p[@style="font-size: 13px;'
                                   ' margin: 0 0 0 0; padding: 0;"]/text()')[0]
            phone = phone.strip()
            if addr:
                person.add_contact_detail(type='address',
                                          value=addr,
                                          note='District Office')

            if phone:
                person.add_contact_detail(type='voice',
                                          value=phone,
                                          note='District Office')
        except IndexError:
            self.warning('no district address for {0}'.format(full_name))

        person.add_link(leg_url)
        person.add_source(url)
        person.add_source(leg_url)

        # committees (skip first link)
        for com in leg_doc.xpath('//a[contains(@href, "committee.php")]')[1:]:
            if com.text.endswith(', '):
                committee, role = com.text_content().rsplit(', ', 1)

                # known roles
                role = {'Treas.': 'treasurer',
                        'Secy.': 'secretary',
                        'Secy./Treas.': 'secretary/treasurer',
                        'V.C.': 'vice-chair',
                        '1st V.C.': 'first vice-chair',
                        'Co 1st V.C.': 'co-first vice-chair',
                        '2nd V.C.': 'second vice-chair',
                        '3rd V.C.': 'third vice-chair',
                        'Ex.Officio Member': 'ex-officio member',
                        'Chairman': 'chairman'}[role]
            else:
                committee = com.text
                role = 'member'

            # only yield each committee once
            if committee not in seen_committees:
                com = Organization(name=committee,
                                   classification='committee',
                                   chamber=chamber)
                com.add_source(url)
                seen_committees[committee] = com
                yield com
            else:
                com = seen_committees[committee]

            person.add_membership(com, role=role)

        yield person