def test_person_add_party():
    """add_party() creates one related membership that validates and whose
    organization pseudo-id carries the party name and classification."""
    groot = Person("Groot")
    groot.add_party("Green")

    membership = groot._related[0]
    membership.validate()

    expected = {"name": "Green", "classification": "party"}
    assert get_pseudo_id(membership.organization_id) == expected
def scrape(self):
    """Scrape Connecticut legislators from the CGA FTP CSV export.

    Yields each committee Organization once (on first sighting) and a
    Person for every legislator row in the spreadsheet.
    """
    # chambers = [chamber] if chamber is not None else ['upper', 'lower']
    leg_url = "ftp://ftp.cga.ct.gov/pub/data/LegislatorDatabase.csv"
    page = self.get(leg_url)
    committees = {}

    # Ensure that the spreadsheet's structure hasn't generally changed
    _row_headers = page.text.split("\r\n")[0].replace('"', "").split(",")
    assert _row_headers == HEADERS, "Spreadsheet structure may have changed"

    page = open_csv(page)
    for row in page:
        chamber = {"H": "lower", "S": "upper"}[row["office code"]]

        district = row["dist"].lstrip("0")
        assert district.isdigit(), "Invalid district found: {}".format(district)

        # Assemble "First [Middle] Last [Suffix]" from the separate columns.
        name = row["first name"]
        mid = row["middle initial"].strip()
        if mid:
            name += " %s" % mid
        name += " %s" % row["last name"]
        suffix = row["suffix"].strip()
        if suffix:
            name += " %s" % suffix

        party = row["party"]
        if party == "Democrat":
            party = "Democratic"

        leg = Person(primary_org=chamber, name=name, district=district, party=party)

        legislator_url = row["URL"].replace("\\", "//").strip()
        if legislator_url != "":
            if not legislator_url.startswith("http"):
                # BUG FIX: the scheme was previously assigned over the whole
                # URL ("legislator_url = 'http://'"), discarding the address.
                # Prepend it instead.
                legislator_url = "http://" + legislator_url
            leg.add_link(legislator_url)

        leg.add_party(party=party)

        office_address = "%s\nRoom %s\nHartford, CT 06106" % (
            row["capitol street address"],
            row["room number"],
        )
        # extra_office_fields = dict()

        # The email column sometimes holds nothing or a contact-form URL
        # rather than an address; anything else without "@" is an error.
        email = row["email"].strip()
        if "@" not in email:
            if not email:
                email = None
            elif email.startswith("http://") or email.startswith("https://"):
                # extra_office_fields['contact_form'] = email
                email = None
            else:
                raise ValueError("Problematic email found: {}".format(email))

        leg.add_contact_detail(type="address", value=office_address, note="Capitol Office")
        leg.add_contact_detail(type="voice", value=row["capitol phone"], note="Capitol Office")
        if email:
            leg.add_contact_detail(type="email", value=email)

        home_address = "{}\n{}, {} {}".format(
            row["home street address"],
            row["home city"],
            row["home state"],
            row["home zip code"],
        )
        # Members who list the LOB as "home" get no district office entry.
        if "Legislative Office Building" not in home_address:
            leg.add_contact_detail(type="address", value=home_address, note="District Office")
            if row["home phone"].strip():
                leg.add_contact_detail(type="voice", value=row["home phone"], note="District Office")

        leg.add_source(leg_url)

        # Committee assignments look like "Name (role);Name;..."; default
        # role is "member". Each committee org is yielded exactly once.
        for comm_name in row["committee member1"].split(";"):
            if " (" in comm_name:
                comm_name, role = comm_name.split(" (")
                role = role.strip(")").lower()
            else:
                role = "member"
            comm_name = comm_name.strip()
            if comm_name:
                if comm_name in committees:
                    com = committees[comm_name]
                else:
                    com = Organization(comm_name, classification="committee", chamber=chamber)
                    com.add_source(leg_url)
                    committees[comm_name] = com
                    yield com
                leg.add_membership(name_or_org=com, role=role)

        yield leg
def scrape_chamber(self, chamber):
    """Yield a Person for each sitting member of one Arizona chamber.

    ``chamber`` is "lower" or "upper"; rows flagged as vacant are skipped.
    """
    body = {"lower": "H", "upper": "S"}[chamber]
    url = "http://www.azleg.gov/MemberRoster/?body=" + body
    markup = self.get(url).text

    # there is a bad comment closing tag on this page
    markup = markup.replace("--!>", "-->")
    doc = html.fromstring(markup)

    # First row of the roster table is the header.
    for tr in doc.xpath("//table//tr")[1:]:
        position = ""
        name_td, district_td, party_td, email_td, room_td, phone_td = tr.xpath("td")

        # Skip any vacant members.
        if email_td.attrib.get("class") == "vacantmember":
            continue

        link = name_td.xpath("string(a/@href)")
        if len(name_td) == 1:
            name = name_td.text_content().strip()
        else:
            # Extra trailing text on the cell is a leadership position.
            position = name_td.tail.strip()
            name = name_td[0].text_content().strip()
        if "--" in name:
            name = name.split("--")[0].strip()

        # Member detail page carries the same broken comment tag.
        member_markup = self.get(link).text.replace("--!>", "-->")
        member_doc = html.fromstring(member_markup)
        member_doc.make_links_absolute(link)

        photos = member_doc.xpath("//img[contains(@src, 'MemberPhoto')]")
        if len(photos) == 1:
            photo_url = photos[0].attrib["src"]
        else:
            self.warning("no photo on " + link)
            photo_url = ""

        district = district_td.text_content().strip()
        party = party_td.text_content().strip()

        email = email_td.text_content().strip()
        if email.startswith("Email: "):
            email = email.replace("Email: ", "").lower() + "@azleg.gov"
        else:
            email = ""

        party = self.get_party(party)
        room = room_td.text_content().strip()

        chamber_line = "House of Representatives\n" if chamber == "lower" else "Senate\n"
        address = chamber_line + "1700 West Washington\n Room " + room + "\nPhoenix, AZ 85007"

        phone = phone_td.text_content().strip()
        # Prepend the Phoenix area code when it is missing.
        if "602" not in re.findall(r"(\d+)", phone):
            phone = "602-" + phone

        leg = Person(
            primary_org=chamber,
            image=photo_url,
            name=name,
            district=district,
            party=party,
        )
        leg.add_contact_detail(type="address", value=address, note="Capitol Office")
        leg.add_contact_detail(type="voice", value=phone, note="Capitol Office")
        leg.add_party(party=party)
        leg.add_link(link)
        if email:
            leg.add_contact_detail(type="email", value=email)
        if position:
            leg.add_membership(name_or_org=party, role=position)
            # leg.add_role(position, term, chamber=chamber,
            #              district=district, party=party)

        leg.add_source(url)

        # Probably just get this from the committee scraper
        # self.scrape_member_page(link, session, chamber, leg)
        yield leg