def test_person_add_party():
    """add_party should attach a validated membership to a party pseudo-org."""
    groot = Person('Groot')
    groot.add_party('Green')
    membership = groot._related[0]
    membership.validate()
    expected = {'name': 'Green', 'classification': 'party'}
    assert get_pseudo_id(membership.organization_id) == expected
def scrape(self):
    """Yield a Person for each historical member of Congress who served past 1970.

    Reads the ``legislators-historical.yaml`` file from a local checkout of
    the unitedstates/congress-legislators repository located next to this
    module.
    """
    current_path = Path(__file__)
    legislator_path = current_path.parent / 'congress-legislators/legislators-historical.yaml'
    with legislator_path.open() as f:
        # NOTE(review): yaml.load with CLoader can construct arbitrary Python
        # objects; the file comes from a pinned checkout, but CSafeLoader
        # would be safer if the data uses no custom tags -- TODO confirm.
        legislators = yaml.load(f, Loader=yaml.CLoader)
    for legislator in legislators:
        # Term dates are ISO-format strings, so lexicographic comparison
        # orders them chronologically; skip anyone whose service ended
        # before 1970.
        if all(term['end'] < '1970' for term in legislator['terms']):
            continue
        # Renamed local from `l` (PEP 8 E741: ambiguous single-letter name).
        person = Person(
            name=' '.join((legislator['name']['first'],
                           legislator['name']['last'])),
            birth_date=legislator['bio'].get('birthday', ''),
            gender=legislator['bio']['gender'])
        parties = set()
        for term in legislator['terms']:
            state = term['state']
            parties.add(term['party'])
            if term['type'] == 'rep':
                role = 'Representative'
                district_name = self._district_name(state, term['district'])
                chamber = 'lower'
            else:
                role = "Senator"
                # Senate "districts" are identified by state and class.
                district_name = "{state}, Class {klass}".format(
                    state=state, klass=term['class'])
                chamber = 'upper'
            person.add_term(role, chamber,
                            district=district_name,
                            start_date=term['start'],
                            end_date=term['end'])
        for party in parties:
            person.add_party(party)
        # Preserve every external identifier scheme (bioguide, thomas, ...).
        for scheme, identifier in legislator['id'].items():
            person.add_identifier(str(identifier), scheme=scheme)
        person.add_source(
            'https://github.com/unitedstates/congress-legislators/blob/master/legislators-historical.yaml'
        )
        yield person
def scrape(self):
    """Yield NYC council members (Person) and their committees (Organization).

    Scrapes the Legistar member list, merges consecutive term spans per
    member, then emits committees (parents before subcommittees) plus two
    organizations missing from the site.
    """
    noncommittees = {'Committee of the Whole'}
    committee_d = {}
    people_d = {}

    # Go to memberlist
    extra_args = {'ctl00$ContentPlaceHolder$lstName': 'City Council'}

    # Group the per-term rows by member URL; the committee list is taken
    # from the first row seen for each member.
    for councilman, committees in self.councilMembers(extra_args=extra_args):
        if 'url' in councilman['Person Name']:
            councilman_url = councilman['Person Name']['url']
            if councilman_url in people_d:
                people_d[councilman_url][0].append(councilman)
            else:
                people_d[councilman_url] = [councilman], committees

    for person_entries, committees in people_d.values():
        councilman = person_entries[-1]
        p = Person(councilman['Person Name']['label'])
        # Disambiguate this member's name; keep the original as an alias.
        if p.name == 'Letitia James':
            p.name = 'Letitia Ms. James'
            p.add_name('Letitia James')

        spans = [(self.toTime(entry['Start Date']).date(),
                  self.toTime(entry['End Date']).date(),
                  entry['District'])
                 for entry in person_entries]

        # Merge adjacent spans (end date + 1 day == next start date, same
        # district) into a single continuous term.
        merged_spans = []
        last_end_date = None
        last_district = None
        for start_date, end_date, district in sorted(spans):
            if last_end_date is None:
                span = [start_date, end_date, district]
            elif (start_date - last_end_date) == datetime.timedelta(1) and district == last_district:
                span[1] = end_date
            else:
                merged_spans.append(span)
                span = [start_date, end_date, district]
            last_end_date = end_date
            last_district = district
        merged_spans.append(span)

        for start_date, end_date, district in merged_spans:
            district = councilman['District'].replace(' 0', ' ')
            # An end date at the close of the current term means "still
            # serving" -- record it as open-ended.
            if end_date == datetime.date(2017, 12, 31):
                end_date = ''
            else:
                end_date = end_date.isoformat()
            # BUG FIX: removed leftover debug print(start_date, end_date).
            p.add_term('Council Member', 'legislature',
                       district=district,
                       start_date=start_date.isoformat(),
                       end_date=end_date)

        party = councilman['Political Party']
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)
        if councilman['Photo']:
            p.image = councilman['Photo']
        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['url'],
                                 note='E-mail')
        if councilman['Web site']:
            p.add_link(councilman['Web site']['url'], note='web site')
        p.extras = {'Notes': councilman['Notes']}
        p.add_source(councilman['Person Name']['url'], note='web')

        for committee, _, _ in committees:
            committee_name = committee['Department Name']['label']
            if committee_name not in noncommittees and 'committee' in committee_name.lower():
                o = committee_d.get(committee_name, None)
                if o is None:
                    parent_id = PARENT_ORGS.get(committee_name,
                                                'New York City Council')
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name': parent_id})
                    o.add_source(committee['Department Name']['url'])
                    committee_d[committee_name] = o
                membership = o.add_member(p, role=committee["Title"])
                membership.start_date = self.mdY2Ymd(committee["Start Date"])
        yield p

    # Yield parent committees before subcommittees so parent_id references
    # can resolve.
    for o in committee_d.values():
        if 'Committee' in o.name:
            yield o
    for o in committee_d.values():
        if 'Subcommittee' in o.name:
            yield o

    # Two organizations that do not appear on the departments page.
    o = Organization(
        'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
        classification='committee',
        parent_id={'name': 'New York City Council'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o

    o = Organization(
        'Subcommittee on Drug Abuse',
        classification='committee',
        parent_id={
            'name': 'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'
        })
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o
def scrape(self):
    """Yield a Person per Connecticut legislator and their committee Organizations.

    Downloads the CGA legislator CSV, verifies its header row against
    HEADERS, and emits committees on first sight so memberships can refer
    to them.
    """
    # chambers = [chamber] if chamber is not None else ['upper', 'lower']
    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    page = self.get(leg_url)
    committees = {}

    # Ensure that the spreadsheet's structure hasn't generally changed
    _row_headers = page.text.split("\r\n")[0].replace('"', "").split(",")
    assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

    page = open_csv(page)
    for row in page:
        chamber = {"H": "lower", "S": "upper"}[row["office code"]]

        district = row["dist"].lstrip("0")
        assert district.isdigit(), "Invalid district found: {}".format(district)

        name = row["first name"]
        mid = row["middle initial"].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row["last name"]
        suffix = row["suffix"].strip()
        if suffix:
            name += " %s" % suffix

        party = row["party"]
        if party == "Democrat":
            party = "Democratic"

        leg = Person(primary_org=chamber, name=name, district=district, party=party)

        legislator_url = row["URL"].replace("\\", "//").strip()
        if legislator_url != "":
            if not legislator_url.startswith("http"):
                # BUG FIX: previously the URL was *replaced* by the bare
                # scheme ("http://"), losing the actual address; prepend
                # the scheme instead.
                legislator_url = "http://" + legislator_url
            leg.add_link(legislator_url)

        leg.add_party(party=party)

        office_address = "%s\nRoom %s\nHartford, CT 06106" % (
            row["capitol street address"],
            row["room number"],
        )
        # extra_office_fields = dict()
        email = row["email"].strip()
        if "@" not in email:
            # Some rows carry a contact-form URL (or nothing) in the email
            # column; treat both as "no email".
            if not email:
                email = None
            elif email.startswith("http://") or email.startswith("https://"):
                # extra_office_fields['contact_form'] = email
                email = None
            else:
                raise ValueError("Problematic email found: {}".format(email))
        leg.add_contact_detail(
            type="address", value=office_address, note="Capitol Office"
        )
        leg.add_contact_detail(
            type="voice", value=row["capitol phone"], note="Capitol Office"
        )
        if email:
            leg.add_contact_detail(type="email", value=email)

        home_address = "{}\n{}, {} {}".format(
            row["home street address"],
            row["home city"],
            row["home state"],
            row["home zip code"],
        )
        # A Legislative Office Building address is not a real district office.
        if "Legislative Office Building" not in home_address:
            leg.add_contact_detail(
                type="address", value=home_address, note="District Office"
            )
            if row["home phone"].strip():
                leg.add_contact_detail(
                    type="voice", value=row["home phone"], note="District Office"
                )
        leg.add_source(leg_url)

        # Committee names may carry a role suffix, e.g. "Judiciary (Chair)".
        for comm_name in row["committee member1"].split(";"):
            if " (" in comm_name:
                comm_name, role = comm_name.split(" (")
                role = role.strip(")").lower()
            else:
                role = "member"
            comm_name = comm_name.strip()
            if comm_name:
                if comm_name in committees:
                    com = committees[comm_name]
                else:
                    com = Organization(
                        comm_name, classification="committee", chamber=chamber
                    )
                    com.add_source(leg_url)
                    committees[comm_name] = com
                    yield com
                leg.add_membership(name_or_org=com, role=role)

        yield leg
def scrape(self):
    """Yield a Person per Connecticut legislator and their committee Organizations.

    Downloads the CGA legislator CSV, verifies its header row against
    HEADERS, and emits committees on first sight so memberships can refer
    to them.
    """
    # chambers = [chamber] if chamber is not None else ['upper', 'lower']
    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    page = self.get(leg_url)
    committees = {}

    # Ensure that the spreadsheet's structure hasn't generally changed
    _row_headers = page.text.split('\r\n')[0].replace('"', '').split(',')
    assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

    page = open_csv(page)
    for row in page:
        chamber = {'H': 'lower', 'S': 'upper'}[row['office code']]

        district = row['dist'].lstrip('0')
        assert district.isdigit(), "Invalid district found: {}".format(
            district)

        name = row['first name']
        mid = row['middle initial'].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row['last name']
        suffix = row['suffix'].strip()
        if suffix:
            name += " %s" % suffix

        party = row['party']
        if party == 'Democrat':
            party = 'Democratic'

        leg = Person(primary_org=chamber,
                     name=name,
                     district=district,
                     party=party)

        legislator_url = row['URL'].replace('\\', '//').strip()
        if legislator_url != '':
            if not legislator_url.startswith('http'):
                # BUG FIX: previously the URL was *replaced* by the bare
                # scheme ('http://'), losing the actual address; prepend
                # the scheme instead.
                legislator_url = 'http://' + legislator_url
            leg.add_link(legislator_url)

        leg.add_party(party=party)

        office_address = "%s\nRoom %s\nHartford, CT 06106" % (
            row['capitol street address'], row['room number'])
        # extra_office_fields = dict()
        email = row['email'].strip()
        if "@" not in email:
            # Some rows carry a contact-form URL (or nothing) in the email
            # column; treat both as "no email".
            if not email:
                email = None
            elif email.startswith('http://') or email.startswith('https://'):
                # extra_office_fields['contact_form'] = email
                email = None
            else:
                raise ValueError(
                    "Problematic email found: {}".format(email))
        leg.add_contact_detail(type='address',
                               value=office_address,
                               note='Capitol Office')
        leg.add_contact_detail(type='voice',
                               value=row['capitol phone'],
                               note='Capitol Office')
        if email:
            leg.add_contact_detail(type='email', value=email)

        home_address = "{}\n{}, {} {}".format(
            row['home street address'],
            row['home city'],
            row['home state'],
            row['home zip code'],
        )
        # A Legislative Office Building address is not a real district office.
        if "Legislative Office Building" not in home_address:
            leg.add_contact_detail(type='address',
                                   value=home_address,
                                   note='District Office')
            if row['home phone'].strip():
                leg.add_contact_detail(type='voice',
                                       value=row['home phone'],
                                       note='District Office')
        leg.add_source(leg_url)

        # Committee names may carry a role suffix, e.g. "Judiciary (Chair)".
        for comm_name in row['committee member1'].split(';'):
            if ' (' in comm_name:
                comm_name, role = comm_name.split(' (')
                role = role.strip(')').lower()
            else:
                role = 'member'
            comm_name = comm_name.strip()
            if comm_name:
                if comm_name in committees:
                    com = committees[comm_name]
                else:
                    com = Organization(comm_name,
                                       classification='committee',
                                       chamber=chamber)
                    com.add_source(leg_url)
                    committees[comm_name] = com
                    yield com
                leg.add_membership(name_or_org=com, role=role)

        yield leg
def scrape_chamber(self, chamber):
    """Yield a Person for each sitting Arizona legislator in *chamber*.

    :param chamber: 'lower' or 'upper'; selects the House or Senate roster.
    """
    body = {'lower': 'H', 'upper': 'S'}[chamber]
    url = 'http://www.azleg.gov/MemberRoster/?body=' + body
    page = self.get(url).text

    # there is a bad comment closing tag on this page
    page = page.replace('--!>', '-->')
    root = html.fromstring(page)

    path = '//table//tr'
    roster = root.xpath(path)[1:]  # first row is the table header
    for row in roster:
        position = ''
        name, district, party, email, room, phone, = row.xpath('td')

        if email.attrib.get('class') == 'vacantmember':
            continue  # Skip any vacant members.

        link = name.xpath('string(a/@href)')
        # A single child element means a plain name cell; otherwise the
        # tail text carries a leadership position (e.g. Speaker).
        if len(name) == 1:
            name = name.text_content().strip()
        else:
            position = name.tail.strip()
            name = name[0].text_content().strip()
        if '--' in name:
            name = name.split('--')[0].strip()

        # Fetch the member's detail page for a photo; it has the same
        # malformed comment tag as the roster page.
        linkpage = self.get(link).text
        linkpage = linkpage.replace('--!>', '-->')
        linkroot = html.fromstring(linkpage)
        linkroot.make_links_absolute(link)

        photos = linkroot.xpath("//img[contains(@src, 'MemberPhoto')]")

        if len(photos) != 1:
            self.warning('no photo on ' + link)
            photo_url = ''
        else:
            photo_url = photos[0].attrib['src']

        district = district.text_content().strip()
        party = party.text_content().strip()
        email = email.text_content().strip()

        # Email cells are rendered as "Email: username"; build the full
        # address from the @azleg.gov domain.
        if email.startswith('Email: '):
            email = email.replace('Email: ', '').lower() + '@azleg.gov'
        else:
            email = ''

        party = self.get_party(party)
        room = room.text_content().strip()
        if chamber == 'lower':
            address = "House of Representatives\n"
        else:
            address = "Senate\n"
        address = address + "1700 West Washington\n Room " + room \
            + "\nPhoenix, AZ 85007"

        phone = phone.text_content().strip()
        # Phone numbers on the page may omit the 602 area code.
        if '602' not in re.findall(r'(\d+)', phone):
            phone = "602-" + phone

        leg = Person(primary_org=chamber, image=photo_url, name=name,
                     district=district, party=party)

        leg.add_contact_detail(type='address', value=address,
                               note='Capitol Office')
        leg.add_contact_detail(type='voice', value=phone,
                               note='Capitol Office')
        leg.add_party(party=party)
        leg.add_link(link)
        if email:
            leg.add_contact_detail(type='email', value=email)

        # A leadership position is modeled as a membership in the party
        # organization with that role.
        if position:
            leg.add_membership(name_or_org=party, role=position)
            # leg.add_role(position, term, chamber=chamber,
            #             district=district, party=party)

        leg.add_source(url)

        # Probably just get this from the committee scraper
        # self.scrape_member_page(link, session, chamber, leg)
        yield leg
def scrape(self):
    """Yield a Person per Connecticut legislator and their committee Organizations.

    Downloads the CGA legislator CSV, verifies its header row against
    HEADERS, and emits committees on first sight so memberships can refer
    to them.
    """
    # chambers = [chamber] if chamber is not None else ['upper', 'lower']
    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    page = self.get(leg_url)
    committees = {}

    # Ensure that the spreadsheet's structure hasn't generally changed
    _row_headers = page.text.split('\r\n')[0].replace('"', '').split(',')
    assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

    page = open_csv(page)
    for row in page:
        chamber = {'H': 'lower', 'S': 'upper'}[row['office code']]

        district = row['dist'].lstrip('0')
        assert district.isdigit(), "Invalid district found: {}".format(district)

        name = row['first name']
        mid = row['middle initial'].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row['last name']
        suffix = row['suffix'].strip()
        if suffix:
            name += " %s" % suffix

        party = row['party']
        if party == 'Democrat':
            party = 'Democratic'

        leg = Person(primary_org=chamber,
                     name=name,
                     district=district,
                     party=party
                     )

        legislator_url = row['URL'].replace('\\', '//').strip()
        if legislator_url != '':
            if not legislator_url.startswith('http'):
                # BUG FIX: previously the URL was *replaced* by the bare
                # scheme ('http://'), losing the actual address; prepend
                # the scheme instead.
                legislator_url = 'http://' + legislator_url
            leg.add_link(legislator_url)

        leg.add_party(party=party)

        office_address = "%s\nRoom %s\nHartford, CT 06106" % (
            row['capitol street address'], row['room number'])
        # extra_office_fields = dict()
        email = row['email'].strip()
        if "@" not in email:
            # Some rows carry a contact-form URL (or nothing) in the email
            # column; treat both as "no email".
            if not email:
                email = None
            elif email.startswith('http://') or email.startswith('https://'):
                # extra_office_fields['contact_form'] = email
                email = None
            else:
                raise ValueError("Problematic email found: {}".format(email))
        leg.add_contact_detail(type='address', value=office_address,
                               note='Capitol Office')
        leg.add_contact_detail(type='voice', value=row['capitol phone'],
                               note='Capitol Office')
        if email:
            leg.add_contact_detail(type='email', value=email)

        home_address = "{}\n{}, {} {}".format(
            row['home street address'],
            row['home city'],
            row['home state'],
            row['home zip code'],
        )
        # A Legislative Office Building address is not a real district office.
        if "Legislative Office Building" not in home_address:
            leg.add_contact_detail(type='address', value=home_address,
                                   note='District Office')
            if row['home phone'].strip():
                leg.add_contact_detail(type='voice', value=row['home phone'],
                                       note='District Office')
        leg.add_source(leg_url)

        # Committee names may carry a role suffix, e.g. "Judiciary (Chair)".
        for comm_name in row['committee member1'].split(';'):
            if ' (' in comm_name:
                comm_name, role = comm_name.split(' (')
                role = role.strip(')').lower()
            else:
                role = 'member'
            comm_name = comm_name.strip()
            if comm_name:
                if comm_name in committees:
                    com = committees[comm_name]
                else:
                    com = Organization(comm_name,
                                       classification='committee',
                                       chamber=chamber)
                    com.add_source(leg_url)
                    committees[comm_name] = com
                    yield com
                leg.add_membership(name_or_org=com, role=role)

        yield leg
def scrape(self):
    """Yield NYC council members and committees by merging API and web data.

    Cross-references the Legistar API term records with the member-list web
    page (which carries district, party, photo, and contact info), then
    emits committee Organizations with memberships.
    """
    web_scraper = LegistarPersonScraper(requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage

    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    # Index the web member list by (stripped) member name.
    web_info = {}

    for member, _ in web_scraper.councilMembers():
        name = member['Person Name']['label'].strip()
        web_info[name] = member

    city_council, = [body for body in self.bodies()
                     if body['BodyName'] == 'City Council']

    terms = collections.defaultdict(list)

    public_advocates = {  # Match casing to Bill De Blasio as council member
        'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
        'The Public Advocate (Ms. James)': 'Letitia James',
    }

    for office in self.body_offices(city_council):
        name = office['OfficeRecordFullName']
        name = public_advocates.get(name, name).strip()
        terms[name].append(office)

        # Add past members (and public advocates) who are absent from the
        # web page; the defaultdict yields None for every missing field.
        if name not in web_info:
            web_info[name] = collections.defaultdict(lambda: None)

    # Check that we have everyone we expect, formatted consistently, in
    # both information arrays. For instance, this will fail if we forget to
    # strip trailing spaces from names on one side or the other (which has
    # the effect of omitting information, such as post, from the scrape).
    assert set(web_info.keys()) == set(terms.keys())

    members = {}

    for member, offices in terms.items():
        p = Person(member)

        web = web_info[member]

        for term in offices:
            role = term['OfficeRecordTitle']
            if role == 'Public Advocate':
                role = 'Non-Voting Council Member'
            else:
                role = 'Council Member'

            district = web.get('District', '').replace(' 0', ' ')

            p.add_term(role, 'legislature',
                       district=district,
                       start_date=self.toDate(term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        party = web.get('Political Party')
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)

        if web.get('Photo'):
            p.image = web['Photo']

        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        for contact_type, (type_, _note) in contact_types.items():
            # BUG FIX: `web(contact_type)` called the dict like a function,
            # raising TypeError whenever the field was populated; index it.
            if web.get(contact_type) and web[contact_type] != 'N/A':
                p.add_contact_detail(type=type_,
                                     value=web[contact_type],
                                     note=_note)

        if web.get('E-mail'):
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['url'],
                                 note='E-mail')

        if web.get('Web site'):
            p.add_link(web['Web site']['url'], note='web site')

        if web.get('Notes'):
            p.extras = {'Notes': web['Notes']}

        if not p.sources:  # Only add sources once
            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

        members[member] = p

    committee_types = ['Committee',
                       'Inactive Committee',
                       'Select Committee',
                       'Subcommittee',
                       'Task Force',
                       'Land Use',  # Committee on Land Use
                       ]

    body_types = {k: v for k, v in self.body_types().items()
                  if k in committee_types}

    for body in self.bodies():
        if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):

            # Skip typo in API data
            if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                continue

            parent_org = PARENT_ORGS.get(body['BodyName'], 'New York City Council')
            body_name = body['BodyName']

            o = Organization(body_name,
                             classification='committee',
                             parent_id={'name': parent_org})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body), note='api')
            o.add_source(self.WEB_URL + '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.format(**body), note='web')

            for office in self.body_offices(body):
                # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                # 'Committee Member', None, 'CHAIRPERSON'
                role = office['OfficeRecordTitle']
                if role and role.lower() == 'chairperson':
                    role = 'Chairperson'
                else:
                    role = 'Member'

                person = office['OfficeRecordFullName']
                person = public_advocates.get(person, person).strip()

                if person in members:
                    p = members[person]
                else:
                    # Committee-only people (never council members) still
                    # need a Person record with sources.
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                p.add_membership(o, role=role,
                                 start_date=self.toDate(office['OfficeRecordStartDate']),
                                 end_date=self.toDate(office['OfficeRecordEndDate']))

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Yield NYC council members and committees by merging API and web data.

    Cross-references the Legistar API term records with the member-list web
    page (which carries district, party, photo, and contact info), then
    emits committee Organizations with memberships.
    """
    web_scraper = LegistarPersonScraper(
        requests_per_minute=self.requests_per_minute)
    web_scraper.MEMBERLIST = 'http://legistar.council.nyc.gov/DepartmentDetail.aspx?ID=6897&GUID=CDC6E691-8A8C-4F25-97CB-86F31EDAB081&Mode=MainBody'

    if self.cache_storage:
        web_scraper.cache_storage = self.cache_storage

    if self.requests_per_minute == 0:
        web_scraper.cache_write_only = False

    # Index the web member list by (stripped) member name.
    web_info = {}

    for member, _ in web_scraper.councilMembers():
        name = member['Person Name']['label'].strip()
        web_info[name] = member

    city_council, = [
        body for body in self.bodies() if body['BodyName'] == 'City Council'
    ]

    terms = collections.defaultdict(list)

    public_advocates = {  # Match casing to Bill De Blasio as council member
        'The Public Advocate (Mr. de Blasio)': 'Bill De Blasio',
        'The Public Advocate (Ms. James)': 'Letitia James',
    }

    for office in self.body_offices(city_council):
        name = office['OfficeRecordFullName']
        name = public_advocates.get(name, name).strip()
        terms[name].append(office)

        # Add past members (and public advocates) who are absent from the
        # web page; the defaultdict yields None for every missing field.
        if name not in web_info:
            web_info[name] = collections.defaultdict(lambda: None)

    # Check that we have everyone we expect, formatted consistently, in
    # both information arrays. For instance, this will fail if we forget to
    # strip trailing spaces from names on one side or the other (which has
    # the effect of omitting information, such as post, from the scrape).
    assert set(web_info.keys()) == set(terms.keys())

    members = {}

    for member, offices in terms.items():
        p = Person(member)

        web = web_info[member]

        for term in offices:
            role = term['OfficeRecordTitle']
            if role == 'Public Advocate':
                role = 'Non-Voting Council Member'
            else:
                role = 'Council Member'

            district = web.get('District', '').replace(' 0', ' ')

            p.add_term(role,
                       'legislature',
                       district=district,
                       start_date=self.toDate(
                           term['OfficeRecordStartDate']),
                       end_date=self.toDate(term['OfficeRecordEndDate']))

        party = web.get('Political Party')
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)

        if web.get('Photo'):
            p.image = web['Photo']

        contact_types = {
            "City Hall Office": ("address", "City Hall Office"),
            "City Hall Phone": ("voice", "City Hall Phone"),
            "Ward Office Phone": ("voice", "Ward Office Phone"),
            "Ward Office Address": ("address", "Ward Office Address"),
            "Fax": ("fax", "Fax")
        }

        for contact_type, (type_, _note) in contact_types.items():
            # BUG FIX: `web(contact_type)` called the dict like a function,
            # raising TypeError whenever the field was populated; index it.
            if web.get(contact_type) and web[contact_type] != 'N/A':
                p.add_contact_detail(type=type_,
                                     value=web[contact_type],
                                     note=_note)

        if web.get('E-mail'):
            p.add_contact_detail(type="email",
                                 value=web['E-mail']['url'],
                                 note='E-mail')

        if web.get('Web site'):
            p.add_link(web['Web site']['url'], note='web site')

        if web.get('Notes'):
            p.extras = {'Notes': web['Notes']}

        if not p.sources:  # Only add sources once
            source_urls = self.person_sources_from_office(term)
            person_api_url, person_web_url = source_urls
            p.add_source(person_api_url, note='api')
            p.add_source(person_web_url, note='web')

        members[member] = p

    committee_types = [
        'Committee', 'Inactive Committee', 'Select Committee', 'Subcommittee',
        'Task Force', 'Land Use'
    ]  # Committee on Land Use

    body_types = {
        k: v
        for k, v in self.body_types().items() if k in committee_types
    }

    for body in self.bodies():
        if body['BodyTypeName'] in body_types \
                or body['BodyName'] in ('Legislative Documents Unit',
                                        'Legal and Government Affairs Division'):

            # Skip typo in API data
            if body['BodyName'] == 'Committee on Mental Health, Developmental Disability, Alcoholism, Substance Abuse amd Disability Services':
                continue

            parent_org = PARENT_ORGS.get(body['BodyName'],
                                         'New York City Council')
            body_name = body['BodyName']

            o = Organization(body_name,
                             classification='committee',
                             parent_id={'name': parent_org})

            o.add_source(self.BASE_URL + '/bodies/{BodyId}'.format(**body),
                         note='api')
            o.add_source(
                self.WEB_URL +
                '/DepartmentDetail.aspx?ID={BodyId}&GUID={BodyGuid}'.
                format(**body),
                note='web')

            for office in self.body_offices(body):
                # Possible roles: 'Council Member', 'MEMBER', 'Ex-Officio',
                # 'Committee Member', None, 'CHAIRPERSON'
                role = office['OfficeRecordTitle']
                if role and role.lower() == 'chairperson':
                    role = 'Chairperson'
                else:
                    role = 'Member'

                person = office['OfficeRecordFullName']
                person = public_advocates.get(person, person).strip()

                if person in members:
                    p = members[person]
                else:
                    # Committee-only people (never council members) still
                    # need a Person record with sources.
                    p = Person(person)

                    source_urls = self.person_sources_from_office(office)
                    person_api_url, person_web_url = source_urls
                    p.add_source(person_api_url, note='api')
                    p.add_source(person_web_url, note='web')

                    members[person] = p

                p.add_membership(o,
                                 role=role,
                                 start_date=self.toDate(
                                     office['OfficeRecordStartDate']),
                                 end_date=self.toDate(
                                     office['OfficeRecordEndDate']))

            yield o

    for p in members.values():
        yield p
def scrape(self):
    """Yield NYC council members (Person) and their committees (Organization).

    Scrapes the Legistar member list, merges consecutive term spans per
    member, then emits committees (parents before subcommittees) plus two
    organizations missing from the site.
    """
    noncommittees = {'Committee of the Whole'}
    committee_d = {}
    people_d = {}

    # Group the per-term rows by member URL; the committee list is taken
    # from the first row seen for each member.
    for councilman, committees in self.councilMembers():
        if 'url' in councilman['Person Name']:
            councilman_url = councilman['Person Name']['url']
            if councilman_url in people_d:
                people_d[councilman_url][0].append(councilman)
            else:
                people_d[councilman_url] = [councilman], committees

    for person_entries, committees in people_d.values():
        councilman = person_entries[-1]
        p = Person(councilman['Person Name']['label'])
        # Disambiguate this member's name; keep the original as an alias.
        if p.name == 'Letitia James':
            p.name = 'Letitia Ms. James'
            p.add_name('Letitia James')

        spans = [(self.toTime(entry['Start Date']).date(),
                  self.toTime(entry['End Date']).date(),
                  entry['District'])
                 for entry in person_entries]

        # Merge adjacent spans (end date + 1 day == next start date, same
        # district) into a single continuous term.
        merged_spans = []
        last_end_date = None
        last_district = None
        for start_date, end_date, district in sorted(spans):
            if last_end_date is None:
                span = [start_date, end_date, district]
            elif (start_date - last_end_date) == datetime.timedelta(1) and district == last_district:
                span[1] = end_date
            else:
                merged_spans.append(span)
                span = [start_date, end_date, district]
            last_end_date = end_date
            last_district = district
        merged_spans.append(span)

        for start_date, end_date, district in merged_spans:
            district = councilman['District'].replace(' 0', ' ')
            # An end date at the close of the current term means "still
            # serving" -- record it as open-ended.
            if end_date == datetime.date(2017, 12, 31):
                end_date = ''
            else:
                end_date = end_date.isoformat()
            # BUG FIX: removed leftover debug print(start_date, end_date).
            p.add_term('Council Member', 'legislature',
                       district=district,
                       start_date=start_date.isoformat(),
                       end_date=end_date)

        party = councilman['Political Party']
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)

        if councilman['Photo']:
            p.image = councilman['Photo']

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['url'],
                                 note='E-mail')

        if councilman['Web site']:
            p.add_link(councilman['Web site']['url'], note='web site')

        p.extras = {'Notes': councilman['Notes']}

        p.add_source(councilman['Person Name']['url'], note='web')

        for committee, _, _ in committees:
            committee_name = committee['Department Name']['label']
            if committee_name not in noncommittees and 'committee' in committee_name.lower():
                o = committee_d.get(committee_name, None)
                if o is None:
                    parent_id = PARENT_ORGS.get(committee_name,
                                                'New York City Council')
                    o = Organization(committee_name,
                                     classification='committee',
                                     parent_id={'name': parent_id})
                    o.add_source(committee['Department Name']['url'])
                    committee_d[committee_name] = o

                membership = o.add_member(p, role=committee["Title"])
                membership.start_date = self.mdY2Ymd(committee["Start Date"])
        yield p

    # Yield parent committees before subcommittees so parent_id references
    # can resolve.
    for o in committee_d.values():
        if 'Committee' in o.name:
            yield o
    for o in committee_d.values():
        if 'Subcommittee' in o.name:
            yield o

    # Two organizations that do not appear on the departments page.
    o = Organization('Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
                     classification='committee',
                     parent_id={'name': 'New York City Council'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o

    o = Organization('Subcommittee on Drug Abuse',
                     classification='committee',
                     parent_id={'name': 'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o
def scrape_chamber(self, chamber):
    """Yield a Person for each sitting Arizona legislator in *chamber*.

    :param chamber: 'lower' or 'upper'; selects the House or Senate roster.
    """
    body = {'lower': 'H', 'upper': 'S'}[chamber]
    url = 'http://www.azleg.gov/MemberRoster/?body=' + body
    page = self.get(url).text

    # there is a bad comment closing tag on this page
    page = page.replace('--!>', '-->')
    root = html.fromstring(page)

    path = '//table//tr'
    roster = root.xpath(path)[1:]  # first row is the table header
    for row in roster:
        position = ''
        name, district, party, email, room, phone, = row.xpath('td')

        if email.attrib.get('class') == 'vacantmember':
            continue  # Skip any vacant members.

        link = name.xpath('string(a/@href)')
        # A single child element means a plain name cell; otherwise the
        # tail text carries a leadership position (e.g. Speaker).
        if len(name) == 1:
            name = name.text_content().strip()
        else:
            position = name.tail.strip()
            name = name[0].text_content().strip()
        if '--' in name:
            name = name.split('--')[0].strip()

        # Fetch the member's detail page for a photo; it has the same
        # malformed comment tag as the roster page.
        linkpage = self.get(link).text
        linkpage = linkpage.replace('--!>', '-->')
        linkroot = html.fromstring(linkpage)
        linkroot.make_links_absolute(link)

        photos = linkroot.xpath("//img[contains(@src, 'MemberPhoto')]")

        if len(photos) != 1:
            self.warning('no photo on ' + link)
            photo_url = ''
        else:
            photo_url = photos[0].attrib['src']

        # BUG FIX: strip the district cell like the other text cells;
        # previously surrounding whitespace leaked into the district value.
        district = district.text_content().strip()
        party = party.text_content().strip()
        email = email.text_content().strip()

        # Email cells are rendered as "Email: username"; build the full
        # address from the @azleg.gov domain.
        if email.startswith('Email: '):
            email = email.replace('Email: ', '').lower() + '@azleg.gov'
        else:
            email = ''

        party = self.get_party(party)
        room = room.text_content().strip()
        if chamber == 'lower':
            address = "House of Representatives\n"
        else:
            address = "Senate\n"
        address = address + "1700 West Washington\n Room " + room \
            + "\nPhoenix, AZ 85007"

        phone = phone.text_content().strip()
        # Phone numbers on the page may omit the 602 area code.
        if '602' not in re.findall(r'(\d+)', phone):
            phone = "602-" + phone

        leg = Person(primary_org=chamber, image=photo_url, name=name,
                     district=district, party=party)

        leg.add_contact_detail(type='address', value=address,
                               note='Capitol Office')
        leg.add_contact_detail(type='voice', value=phone,
                               note='Capitol Office')
        leg.add_party(party=party)
        leg.add_link(link)
        if email:
            leg.add_contact_detail(type='email', value=email)

        # A leadership position is modeled as a membership in the party
        # organization with that role.
        if position:
            leg.add_membership(name_or_org=party, role=position)
            # leg.add_role(position, term, chamber=chamber,
            #             district=district, party=party)

        leg.add_source(url)

        # Probably just get this from the committee scraper
        # self.scrape_member_page(link, session, chamber, leg)
        yield leg
def scrape_chamber(self, chamber):
    """Yield a Person for each sitting Arizona legislator in *chamber*.

    :param chamber: 'lower' or 'upper'; selects the House or Senate roster.
    """
    body = {"lower": "H", "upper": "S"}[chamber]
    url = "http://www.azleg.gov/MemberRoster/?body=" + body
    page = self.get(url).text

    # there is a bad comment closing tag on this page
    page = page.replace("--!>", "-->")
    root = html.fromstring(page)

    path = "//table//tr"
    roster = root.xpath(path)[1:]  # first row is the table header
    for row in roster:
        position = ""
        name, district, party, email, room, phone, = row.xpath("td")

        if email.attrib.get("class") == "vacantmember":
            continue  # Skip any vacant members.

        link = name.xpath("string(a/@href)")
        # A single child element means a plain name cell; otherwise the
        # tail text carries a leadership position (e.g. Speaker).
        if len(name) == 1:
            name = name.text_content().strip()
        else:
            position = name.tail.strip()
            name = name[0].text_content().strip()
        if "--" in name:
            name = name.split("--")[0].strip()

        # Fetch the member's detail page for a photo; it has the same
        # malformed comment tag as the roster page.
        linkpage = self.get(link).text
        linkpage = linkpage.replace("--!>", "-->")
        linkroot = html.fromstring(linkpage)
        linkroot.make_links_absolute(link)

        photos = linkroot.xpath("//img[contains(@src, 'MemberPhoto')]")

        if len(photos) != 1:
            self.warning("no photo on " + link)
            photo_url = ""
        else:
            photo_url = photos[0].attrib["src"]

        district = district.text_content().strip()
        party = party.text_content().strip()
        email = email.text_content().strip()

        # Email cells are rendered as "Email: username"; build the full
        # address from the @azleg.gov domain.
        if email.startswith("Email: "):
            email = email.replace("Email: ", "").lower() + "@azleg.gov"
        else:
            email = ""

        party = self.get_party(party)
        room = room.text_content().strip()
        if chamber == "lower":
            address = "House of Representatives\n"
        else:
            address = "Senate\n"
        address = (address + "1700 West Washington\n Room " + room +
                   "\nPhoenix, AZ 85007")

        phone = phone.text_content().strip()
        # Phone numbers on the page may omit the 602 area code.
        if "602" not in re.findall(r"(\d+)", phone):
            phone = "602-" + phone

        leg = Person(
            primary_org=chamber,
            image=photo_url,
            name=name,
            district=district,
            party=party,
        )

        leg.add_contact_detail(type="address", value=address, note="Capitol Office")
        leg.add_contact_detail(type="voice", value=phone, note="Capitol Office")
        leg.add_party(party=party)
        leg.add_link(link)
        if email:
            leg.add_contact_detail(type="email", value=email)

        # A leadership position is modeled as a membership in the party
        # organization with that role.
        if position:
            leg.add_membership(name_or_org=party, role=position)
            # leg.add_role(position, term, chamber=chamber,
            #             district=district, party=party)

        leg.add_source(url)

        # Probably just get this from the committee scraper
        # self.scrape_member_page(link, session, chamber, leg)
        yield leg
def test_person_add_party():
    """A party added via add_party is stored as a valid pseudo-id membership."""
    person = Person('Groot')
    person.add_party('Green')
    related = person._related[0]
    related.validate()
    assert get_pseudo_id(related.organization_id) == {
        'name': 'Green',
        'classification': 'party',
    }