def scrape_interim_committee(self, link, name):
    """Scrape one interim (joint) committee or subcommittee page.

    :param link: anchor element whose href points at the committee page
    :param name: committee name as shown on the listing page
    :returns: the populated Organization
    """
    # The href sometimes contains stray whitespace; strip it all out.
    url = re.sub(r"\s+", "", link.attrib["href"])
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    if "Subcommittee" in name:
        # Check whether the parent committee is manually defined first
        # before attempting to automatically resolve it.
        parent = WVCommitteeScraper.subcommittee_parent_map.get(name, None)
        if parent is None:
            # Fall back to everything before the word "Subcommittee".
            parent = name.partition("Subcommittee")[0].strip()

        comm = Organization(
            name=name,
            classification="committee",
            # Parent must have been scraped earlier and registered in
            # self._joint_committees (see the else branch below).
            parent_id=self._joint_committees[parent],
        )
    else:
        comm = Organization(
            name=name, classification="committee", chamber="legislature"
        )
        # Register top-level joint committees so later subcommittees
        # can resolve their parent_id.
        self._joint_committees[name] = comm

    comm.add_source(url)

    # Member links all carry a "?member=" query parameter.
    xpath = '//a[contains(@href, "?member=")]'
    for link in doc.xpath(xpath):
        name = link.text_content().strip()
        name = re.sub(r"^Delegate\s+", "", name)
        name = re.sub(r"^Senator\s+", "", name)
        # Role (e.g. "Chair") is the text of the element following the link.
        role = link.getnext().text or "member"
        comm.add_member(name, role.strip())

    return comm
def scrape_reps_comm(self):
    """Yield House standing committees from the Maine house committee page.

    Relies on the page's fixed layout: committee headings are in odd-numbered
    <center> elements and member lists in sequential top-level <ul> elements.
    """
    # As of 1/27/15, the committee page has the wrong
    # session number (126th) at the top, but
    # has newly elected people, so we're rolling with it.
    url = "http://legislature.maine.gov/house/hsecoms.htm"
    page = self.get(url).text
    root = lxml.html.fromstring(page)

    count = 0

    # Committee headings live in //body/center[1], [3], [5], ... [11].
    for n in range(1, 12, 2):
        path = "string(//body/center[%s]/h1/a)" % (n)
        comm_name = root.xpath(path)
        committee = Organization(
            chamber="lower", name=comm_name, classification="committee"
        )
        count = count + 1

        # The n-th committee's roster is the count-th <ul> on the page.
        path2 = "/html/body/ul[%s]/li/a" % (count)
        for el in root.xpath(path2):
            rep = el.text
            if rep.find("(") != -1:
                mark = rep.find("(")
                # Drop the leading title prefix (first 15 chars) and the
                # parenthesized district suffix.
                # NOTE(review): the 15-char prefix width is layout-dependent;
                # verify against the live page.
                rep = rep[15:mark].strip()
            if "chair" in rep.lower():
                role = "chair"
                # Remove the trailing ", Chair" marker from the name.
                rep = re.sub(r"(?i)[\s,]*chair\s*$", "", rep).strip()
            else:
                role = "member"
            committee.add_member(rep, role)
        committee.add_source(url)

        yield committee
def _scrape_lower_special_committees(self):
    """Yield special committees from the Louisiana House accordion page.

    Committees whose names start with "Joint" are classified under the
    full legislature; all others are lower-chamber committees.
    """
    url = "http://house.louisiana.gov/H_Cmtes/SpecialCommittees.aspx"
    page = self.lxmlize(url)

    committee_list = page.xpath('//div[@class="accordion"]')[0]
    headers = committee_list.xpath("./h3")

    for header in headers:
        committee_name_text = header.xpath("string()")
        committee_name = committee_name_text.strip()
        committee_name = self._normalize_committee_name(committee_name)

        # "Joint" committees belong to both chambers.
        chamber = "legislature" if committee_name.startswith("Joint") else "lower"

        committee = Organization(
            committee_name, chamber=chamber, classification="committee"
        )
        committee.add_source(url)

        # The roster lives in the accordion pane immediately after the header.
        committee_memberlist = header.xpath(
            './following-sibling::div[@class="pane"]'
            '//tr[@class="linkStyle2"]'
        )

        for row in committee_memberlist:
            # Column 1: member name; column 2: role.
            member_name = row.xpath("normalize-space(string(./th[1]))")
            member_name = self._normalize_member_name(member_name)
            member_role = row.xpath("normalize-space(string(./th[2]))")
            member_role = self._normalize_member_role(member_role)
            committee.add_member(member_name, member_role)

        yield committee
def scrape_chamber(self, chamber): session = self.latest_session() # since we are scraping only latest_session session_id = session_metadata.session_id_meta_data[session] client = AZClient() committees = client.list_committees( sessionId=session_id, includeOnlyCommitteesWithAgendas="false", legislativeBody="S" if chamber == "upper" else "H", ) for committee in committees.json(): c = Organization( name=committee["CommitteeName"], chamber=chamber, classification="committee", ) details = client.get_standing_committee( sessionId=session_id, legislativeBody="S" if chamber == "upper" else "H", committeeId=committee["CommitteeId"], includeMembers="true", ) for member in details.json()[0]["Members"]: c.add_member( u"{} {}".format(member["FirstName"], member["LastName"]), role=parse_role(member), ) c.add_source(details.url) c.add_source(committees.url) yield c
def scrape_page(self, link, chamber=None):
    """Yield one committee scraped from the roster page behind *link*.

    :param link: anchor element; its text is the committee name and its
        href the committee page
    :param chamber: chamber classification to record, if known
    """
    page = self.lxmlize(link.attrib["href"])
    comName = link.text
    # Map the site's role labels onto canonical role names.
    roles = {
        "Chair": "chair",
        "Vice Chair": "vice-chair",
        "Vice-Chair": "vice-chair",
    }
    committee = Organization(comName, chamber=chamber, classification="committee")
    committee.add_source(link.attrib["href"])

    for member in page.xpath('//div[@class="members"]/' + 'div[@class="roster-item"]'):
        details = member.xpath('.//div[@class="member-details"]')[0]
        person = details.xpath("./h4")[0].text_content()
        # This page does random weird things with whitespace to names
        person = " ".join(person.strip().split())
        if not person:
            continue
        # A missing member-role span means a plain member.
        role = details.xpath('./span[@class="member-role"]')
        if role:
            role = roles[role[0].text]
        else:
            role = "member"
        committee.add_member(person, role=role)

    yield committee
def scrape_lower_committee(self, name, url):
    """Scrape one lower-chamber committee roster page.

    :param name: committee name as shown on the listing page
    :param url: committee detail page URL
    :returns: the populated Organization
    """
    page = self.lxmlize(url)

    committee = Organization(chamber="lower", name=name, classification="committee")
    committee.add_source(url)

    # Track member names already added so a member linked more than once
    # on the page is only recorded once.
    seen = set()
    member_links = self.get_nodes(
        page, '//div[@class="mod-inner"]//a[contains(@href, "mem")]')
    for member_link in member_links:
        member_name = member_link.text
        if member_name is None:
            continue

        # Figure out if this person is the chair: the first linked member
        # on the page is the chair.
        if member_link == member_links[0]:
            member_role = "chair"
        else:
            member_role = "member"

        # BUG FIX: the original tested ``name not in seen`` — the committee
        # name, which was never added to ``seen`` — so the check was always
        # true and duplicate member links produced duplicate memberships.
        if member_name not in seen:
            committee.add_member(member_name, member_role)
            seen.add(member_name)

    return committee
def _scrape_upper_committee(self, name, url2):
    """Yield one Senate committee scraped from its assignments page.

    :param name: committee name
    :param url2: committee landing page; the roster lives on the
        sibling "Assignments.asp" page
    """
    roster_url = url2.replace("default.asp", "Assignments.asp")

    committee = Organization(name, chamber="upper", classification="committee")
    committee.add_source(url2)

    page = self.lxmlize(roster_url)

    # By page convention the first listed member chairs the committee and
    # the second is vice-chair; everyone else is a plain member.
    member_nodes = page.xpath('//table[@id="table38"]//font/a/b')
    for position, node in enumerate(member_nodes):
        if position == 0:
            role = "Chairman"
        elif position == 1:
            role = "Vice-Chairman"
        else:
            role = "member"

        member_name = node.xpath("string()").replace("Senator ", "")
        # Collapse runs of whitespace inside the name.
        member_name = re.sub(r"[\s]{2,}", " ", member_name).strip()
        committee.add_member(member_name, role)

    yield committee
def scrape_senate_comm(self):
    """Yield Senate standing committees from the Maine Senate page.

    Each committee is a <strong> heading; its members are anchor-bearing
    sibling paragraphs that follow the heading until a paragraph without
    an anchor is reached.
    """
    url = (
        "http://legislature.maine.gov/committee-information/"
        "standing-committees-of-the-senate")
    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    headings = doc.xpath("//p/strong")
    for heading in headings:
        committee = Organization(
            chamber="upper",
            # Headings end with a colon; strip it from the name.
            name=heading.text.strip(":"),
            classification="committee",
        )
        committee.add_source(url)

        # Walk forward through sibling paragraphs, one member per <a>,
        # stopping at the first paragraph with no anchor.
        par = heading.getparent().getnext()
        while True:
            link = par.xpath("a")
            if len(link) == 0:
                break
            # Pattern captures (name, chair-marker); marker is None for
            # plain members.
            res = self.senate_committee_pattern.search(link[0].text)
            name, chair = res.groups()
            committee.add_member(
                name, "chair" if chair is not None else "member")
            par = par.getnext()

        yield committee
def scrape_committee(self, name, url, chamber):
    """Scrape one committee roster page and return its Organization.

    :param name: committee name
    :param url: committee page URL
    :param chamber: chamber classification for the committee
    :raises Exception: if a parenthesized role suffix is unrecognized
    """
    committee = Organization(name=name, chamber=chamber, classification="committee")
    committee.add_source(url)

    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    for member in doc.xpath('//div[@id="members"]/div[@id="members"]/p/a/text()'):
        member = member.replace("Representative ", "").replace("Senator ", "").strip()

        role = "member"
        if " (" in member:
            # Names may carry a parenthesized role, e.g. "Jane Doe (Chair)".
            # Check Vice-Chair and Co-Chair before the bare "Chair" substring.
            member, raw_role = member.split(" (")
            if "Vice-Chair" in raw_role:
                role = "vice-chair"
            elif "Co-Chair" in raw_role:
                role = "co-chair"
            elif "Chair" in raw_role:
                role = "chair"
            else:
                raise Exception("unknown role: %s" % raw_role)

        committee.add_member(member, role)

    return committee
def scrape_senate_committee(self, url):
    """Yield one Senate committee scraped from its detail page.

    Also remembers the Appropriations committee on the instance so its
    subcommittees can be attached to it later.
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    headers = doc.xpath('(//div[@class="row"])[2]//h1')
    assert len(headers) == 1
    name = " ".join(headers[0].xpath("./text()"))
    # Drop the trailing "... Committee" suffix from the heading.
    name = re.sub(r"\s+Committee.*$", "", name)

    com = Organization(chamber="upper", name=name, classification="committee")

    for member in doc.xpath('(//div[@class="row"])[3]/div[1]/ul[1]/li'):
        text = member.text_content()
        member_name = member.xpath("./a/text()")[0].replace(
            "Representative ", "")
        # Role is indicated by descriptive text alongside the name;
        # check the more specific phrases first.
        if "Committee Chair" in text:
            role = "chair"
        elif "Minority Vice" in text:
            role = "minority vice chair"
        elif "Vice" in text:
            role = "majority vice chair"
        else:
            role = "member"

        com.add_member(member_name, role=role)

    com.add_source(url)

    # Remember Appropriations so subcommittees can reference it as parent.
    if com.name == "Appropriations":
        self._senate_appropriations = com

    yield com
def scrape_approp_subcommittees(self):
    """Yield Senate Appropriations subcommittees.

    Requires self._senate_appropriations to have been set by the main
    Senate committee scrape (used as each subcommittee's parent).
    """
    URL = "http://www.senate.michigan.gov/committee/appropssubcommittee.html"
    html = self.get(URL).text
    doc = lxml.html.fromstring(html)

    for strong in doc.xpath("//strong"):
        com = Organization(
            name=strong.text.strip(),
            parent_id=self._senate_appropriations,
            classification="committee",
        )
        com.add_source(URL)

        # Member names follow the heading as the tail text of the next
        # element, e.g. "Senators A, B (C) and C (VC)".
        legislators = strong.getnext().tail.replace("Senators", "").strip()
        for leg in re.split(", | and ", legislators):
            # Slices drop the role marker plus its preceding space:
            # " (C)" = 4 chars, " (VC)" = 5, " (MVC)" = 6.
            if leg.endswith("(C)"):
                role = "chairman"
                leg = leg[:-4]
            elif leg.endswith("(VC)"):
                role = "vice chairman"
                leg = leg[:-5]
            elif leg.endswith("(MVC)"):
                role = "minority vice chairman"
                leg = leg[:-6]
            else:
                role = "member"
            com.add_member(leg, role=role)

        yield com
def test_committee_add_member_person():
    """Adding a Person as a member links that person to the committee."""
    committee = Organization("Defense", classification="committee")
    person = Person("John Adams")

    committee.add_member(person, role="chairman")

    membership = committee._related[0]
    assert membership.person_id == person._id
    assert membership.organization_id == committee._id
    assert membership.role == "chairman"
def handle_page(self):
    """Yield the committee described on the current page (self.doc).

    Appropriations subcommittees are currently skipped entirely.
    """
    name = self.doc.xpath('//h2[@class="committeeName"]')[0].text
    if name.startswith("Appropriations Subcommittee"):
        return
        # TODO: restore scraping of Appropriations Subcommittees
        # name = name.replace('Appropriations ', '')
        # parent = {'name': 'Appropriations', 'classification': 'upper'}
        # chamber = None
    else:
        if name.startswith("Committee on"):
            name = name.replace("Committee on ", "")
        parent = None
        chamber = "upper"

    comm = Organization(
        name=name, classification="committee", chamber=chamber, parent_id=parent
    )

    # Officers: <dt> holds the role label, the following <dd> the name.
    for dt in self.doc.xpath('//div[@id="members"]/dl/dt'):
        role = dt.text.replace(": ", "").strip().lower()
        member = dt.xpath("./following-sibling::dd")[0].text_content()
        member = self.clean_name(member)
        comm.add_member(member, role=role)

    # Plain members are listed separately in <ul>/<li> items.
    for ul in self.doc.xpath('//div[@id="members"]/ul/li'):
        member = self.clean_name(ul.text_content())
        comm.add_member(member)

    comm.add_source(self.url)
    yield comm
def scrape(self, session=None):
    """Yield Wyoming committees from the LSO JSON API for *session*.

    :param session: session identifier; defaults to the latest session
    """
    if session is None:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    # com_types = ['J', 'SE', 'O']
    # base_url = 'https://wyoleg.gov/LsoService/api/committeeList/2018/J'
    url = "https://wyoleg.gov/LsoService/api/committees/{}".format(session)
    response = self.get(url)
    coms_json = json.loads(response.content.decode("utf-8"))

    for row in coms_json:
        # Fetch the full detail record for each committee.
        com_url = "https://wyoleg.gov/LsoService/api/committeeDetail/{}/{}".format(
            session, row["ownerID"])
        com_response = self.get(com_url)
        com = json.loads(com_response.content.decode("utf-8"))

        # WY doesn't seem to have any house/senate only committees that I can find
        committee = Organization(
            name=com["commName"], chamber="legislature", classification="committee"
        )

        for member in com["commMembers"]:
            role = "chairman" if member["chairman"] == "Chairman" else "member"
            committee.add_member(member["name"], role)

        # some WY committees have non-legislators appointed to the member by the Governor
        # but the formatting is super inconsistent
        if com["otherMembers"]:
            committee.extras["other_members"] = com["otherMembers"]

        # Preserve WY-specific metadata in extras for downstream consumers.
        committee.extras["wy_id"] = com["commID"]
        committee.extras["wy_code"] = com["ownerID"]
        committee.extras["wy_type_code"] = com["type"]
        committee.extras["budget"] = com["budget"]

        if com["statAuthority"]:
            committee.extras["statutory_authority"] = com["statAuthority"]

        if com["number"]:
            committee.extras["seat_distribution"] = com["number"]

        committee.add_identifier(
            scheme="WY Committee ID", identifier=str(com["commID"]))
        committee.add_identifier(
            scheme="WY Committee Code", identifier=str(com["ownerID"]))

        if com["description"]:
            committee.add_identifier(
                scheme="Common Name", identifier=com["description"])

        # Source is the human-readable committee page, not the API endpoint.
        source_url = "http://wyoleg.gov/Committees/{}/{}".format(
            session, com["ownerID"])
        committee.add_source(source_url)

        yield committee
def _scrape_standing_committees(self):
    """Scrapes the Standing Committees page of the Nebraska state legislature.

    Yields one Organization per standing committee, with members taken
    from each committee's detail page.
    """
    main_url = (
        "http://www.nebraskalegislature.gov/committees/standing-committees.php"
    )
    page = self.lxmlize(main_url)

    committee_nodes = self.get_nodes(
        page,
        '//a[@class="accordion-switch"][contains(text(), "Standing Committees")]'
        '/ancestor::div[@class="panel panel-leg"]//div[@class="list-group"]'
        '/a[@class="list-group-item"]',
    )

    for committee_node in committee_nodes:
        committee_page_url = committee_node.attrib["href"]
        committee_page = self.lxmlize(committee_page_url)

        name_text = self.get_node(
            committee_page,
            '//div[@class="container view-front"]/div[@class="row"]/'
            'div[@class="col-sm-6 col-md-7"]/h1/text()[normalize-space()]',
        )
        # The heading ends with the word "Committee"; drop that final word.
        # (Replaces a manual index loop that rebuilt the string by hand.)
        committee_name = " ".join(name_text.split()[0:-1])

        org = Organization(
            name=committee_name, chamber="legislature", classification="committee"
        )

        members = self.get_nodes(
            committee_page,
            '//div[@class="col-sm-4 col-md-3 ltc-col-right"][1]/'
            'div[@class="block-box"][1]/ul[@class="list-unstyled '
            'feature-content"]/li/a/text()[normalize-space()]',
        )

        for member in members:
            # BUG FIX: the pattern was r"\Sen\.\s+" — "\S" matches ANY
            # non-whitespace character and the pattern was unanchored, so
            # it could mangle names containing "en." (e.g. "Jensen. ").
            # Anchor it and match the literal "Sen." title prefix.
            member_name = re.sub(r"^Sen\.\s+", "", member)
            member_name = re.sub(r", Chairperson", "", member_name).strip()
            if "Chairperson" in member:
                member_role = "Chairperson"
            else:
                member_role = "member"
            org.add_member(member_name, member_role)

        org.add_source(main_url)
        org.add_source(committee_page_url)

        yield org
def scrape(self):
    """Yield DC Council committees from dccouncil.us.

    Committees with no resolvable members are logged and skipped.
    """
    com_url = "http://dccouncil.us/committees"
    data = self.get(com_url).text
    doc = lxml.html.fromstring(data)
    doc.make_links_absolute(com_url)

    # De-duplicate committee links that appear multiple times on the page.
    comms = set(doc.xpath('//a[contains(@href, "dccouncil.us/committees/")]'))

    for committee in comms:
        url = committee.attrib["href"]
        name = committee.text_content().strip()
        comm_data = self.get(url).text
        comm_page = lxml.html.fromstring(comm_data)
        comm_page.make_links_absolute(url)

        # classify these as belonging to the legislature
        committee = Organization(
            name=name, classification="committee", chamber="legislature"
        )

        if comm_page.xpath('//p[@class="page-summary"]'):
            summary = (
                comm_page.xpath('//p[@class="page-summary"]')[0]
                .text_content()
                .strip()
            )
            committee.extras["summary"] = summary

        # NOTE(review): assumes every committee page has a "Chairperson"
        # heading; chair[0] would raise IndexError otherwise — confirm.
        chair = comm_page.xpath("//h4[text()='Chairperson']/following-sibling::p")
        chair_name = chair[0].text_content().strip()
        chair_name = self.remove_title(chair_name)
        committee.add_member(chair_name, role="chair")

        members = comm_page.xpath(
            "//h4[text()='Councilmembers']/following-sibling::ul"
        )
        members = members[0].xpath("./li")

        for m in members:
            mem_name = m.text_content().strip()
            mem_name = self.remove_title(mem_name)
            # The chair is often repeated in the member list; skip it.
            if mem_name != chair_name:
                committee.add_member(mem_name)

        committee.add_source(url)
        committee.add_link(url, note="Official Website")

        if not committee._related:
            self.warning("empty committee: %s;", name)
        else:
            yield committee
def scrape(self, session=None):
    """Yield NJ committees from the legislature's Access database dumps.

    :param session: numeric legislature number (e.g. 219); converted to the
        first calendar year of that legislature.
    """
    # Session 209 began in 2000; each session spans two years.
    year_abr = ((int(session) - 209) * 2) + 2000
    self._init_mdb(year_abr)
    members_csv = self.access_to_csv("COMember")
    info_csv = self.access_to_csv("Committee")

    org_dictionary = {}

    # Committee Info Database
    for rec in info_csv:
        abrv = rec["Code"]
        comm_name = rec["Description"]

        # Committee codes start with A (Assembly) or S (Senate).
        if abrv[0] == "A":
            chamber = "lower"
        elif abrv[0] == "S":
            chamber = "upper"

        org = Organization(
            name=comm_name, chamber=chamber, classification="committee"
        )
        org.add_source("http://www.njleg.state.nj.us/downloads.asp")
        org_dictionary[abrv] = org

    # Committee Member Database
    # FIXME: E.g. "SCEB" is the Select Commission on Emergency COVID-19 Borrowing.
    # https://www.njleg.state.nj.us/committees/sceb.asp
    # Its members have "O" for their position code. What does that mean? They're
    # only called members on the web page, so I'll go with that.
    POSITIONS = {
        "C": "chair",
        "V": "vice-chair",
        "": "member",
        "O": "member",
    }
    for member_rec in members_csv:
        # assignment=P means they are active, assignment=R means removed
        if member_rec["Assignment_to_Committee"] == "P":
            abr = member_rec["Code"]
            org = org_dictionary[abr]

            leg = member_rec["Member"]
            role = POSITIONS[member_rec["Position_on_Committee"]]
            # Convert "Last, First" to "First Last".
            leg = " ".join(leg.split(", ")[::-1])
            org.add_member(leg, role=role)

    for org in org_dictionary.values():
        yield org
def _scrape_select_special_committees(self):
    """Scrapes the Select and Special Committees page of the Nebraska
    state legislature.

    Yields one Organization per committee; committees with no members
    are logged and skipped.
    """
    main_url = "http://www.nebraskalegislature.gov/committees/select-committees.php"
    page = self.lxmlize(main_url)

    committee_nodes = self.get_nodes(
        page,
        '//a[contains(@class, "accordion-switch")]'
        '/ancestor::div[@class="panel panel-leg"]',
    )

    for committee_node in committee_nodes:
        committee_name = self.get_node(
            committee_node,
            './/h2[@class="panel-title"]/text()[normalize-space()]')

        # Some panel titles wrap the name in an anchor; fall back to it.
        if committee_name is None:
            committee_name = self.get_node(
                committee_node,
                './/h2[@class="panel-title"]/a/text()[normalize-space()]',
            )

        org = Organization(
            name=committee_name, chamber="legislature", classification="committee"
        )
        org.add_source(main_url)

        members = self.get_nodes(
            committee_node,
            './/a[@class="list-group-item"]'
            "/text()[normalize-space()]",
        )

        for member in members:
            # BUG FIX: the pattern was r"\Sen\.\s+" — "\S" matches ANY
            # non-whitespace character and the pattern was unanchored, so
            # it could mangle names containing "en.". Anchor it and match
            # the literal "Sen." title prefix.
            member_name = re.sub(r"^Sen\.\s+", "", member)
            member_name = re.sub(r", Chairperson", "", member_name).strip()
            if "Chairperson" in member:
                member_role = "Chairperson"
            else:
                member_role = "member"
            org.add_member(member_name, member_role)

        if not org._related:
            self.warning("No members found in {} committee.".format(org.name))
        else:
            yield org
def scrape_committee(self, chamber, url):
    """Scrape one committee page; return the Organization, or None if the
    page lists no members.

    :param chamber: chamber classification to record
    :param url: committee page URL (page <title> is the committee name)
    """
    page = lxml.html.fromstring(self.get(url).text)

    committee_name = page.xpath("//title/text()")[0]
    committee = Organization(
        committee_name, chamber=chamber, classification="committee"
    )
    committee.add_source(url)

    member_links = page.xpath('//a[contains(@href, "/Legislators/Profile")]')
    for member_link in member_links:
        # A sibling <span> holds the role label; absent span means member.
        spans = member_link.xpath("../span")
        if spans:
            role = spans[0].text.lower()
        else:
            role = "member"
        committee.add_member(member_link.text, role)

    # Only report the committee when at least one member was found.
    if member_links:
        return committee
def _scrape_committee(self, committee_name, link, chamber):
    """Scrape individual committee page and add members.

    :param committee_name: display name of the committee
    :param link: URL of the committee page
    :param chamber: chamber classification for top-level committees
    :returns: the populated Organization
    """
    page = self.get(link).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(link)

    # Subcommittee pages show a breadcrumb link back to "Committee".
    is_subcommittee = bool(page.xpath('//li/a[text()="Committee"]'))

    if is_subcommittee:
        # All TN subcommittees are just the name of the parent committee with " Subcommittee"
        # at the end
        parent_committee_name = re.sub(
            r"\s*(Study )?Subcommittee\s*", "", committee_name)
        com = Organization(
            committee_name,
            classification="committee",
            # Parent must have been scraped (and registered) earlier.
            parent_id=self.parents[parent_committee_name],
        )
    else:
        com = Organization(
            committee_name, chamber=chamber, classification="committee")
        # Register this committee so its subcommittees can find it.
        self.parents[committee_name] = com._id

    OFFICER_SEARCH = ('//h2[contains(text(), "Committee Officers")]/'
                      "following-sibling::div/ul/li/a")
    MEMBER_SEARCH = ('//h2[contains(text(), "Committee Members")]/'
                     "following-sibling::div/ul/li/a")
    for a in page.xpath(OFFICER_SEARCH) + page.xpath(MEMBER_SEARCH):
        # Names can be split across the anchor's text and nested spans.
        member_name = " ".join([
            x.strip() for x in a.xpath("text()") + a.xpath("span/text()")
            if x.strip()
        ])
        # Role appears in a <small> element when present.
        role = a.xpath("small")
        if role:
            role = role[0].xpath("text()")[0].strip()
        else:
            role = "member"
        if "(Vacant)" in role:
            continue

        com.add_member(member_name, role)

    com.add_link(link)
    com.add_source(link)

    return com
def scrape_house_committees(self):
    """Yield Minnesota House committees from the member-list index page."""
    url = "http://www.house.leg.state.mn.us/comm/commemlist.asp"
    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    for com in doc.xpath('//h2[@class="commhighlight"]'):
        members_url = com.xpath(
            'following-sibling::p[1]/a[text()="Members"]/@href')[0]

        # Note: `com` is rebound from the heading element to the
        # Organization here.
        com = Organization(com.text, chamber="lower", classification="committee")
        com.add_source(members_url)

        try:
            member_html = self.get(members_url).text
            mdoc = lxml.html.fromstring(member_html)
        except HTTPError:
            # Broken member page: keep the committee out of the results.
            self.warning(
                "Member list for {} failed to respond; skipping".format(
                    com.name))
            continue

        # each legislator in their own table
        # first row, second column contains all the info
        for ltable in mdoc.xpath("//table/tr[1]/td[2]/p/b[1]"):
            # name is tail string of last element
            name = ltable.text_content()
            text = ltable.text
            if text and name != text:
                name = name.replace(text, "")

            # role is inside a nested b tag
            role = ltable.xpath("b/*/text()")
            if role:
                # if there was a role, remove it from name
                role = role[0]
                name = name.replace(role, "")
            else:
                role = "member"
            # Drop a trailing parenthesized district, e.g. " (12B)".
            name = name.split(" (")[0]
            com.add_member(name.strip(), role)

        # save
        yield com
def scrape_upper_committee(self, url):
    """Yield one Puerto Rico Senate committee from its page at *url*.

    Committee names are normalized by stripping the Spanish "Comisión …"
    prefixes and leading articles; member titles are translated to English.
    """
    doc = self.lxmlize(url)
    inner_content = self.get_node(doc, '//section[@class="inner-content"]')
    comm_name = self.get_node(inner_content, ".//h2").text.strip()

    # Remove "Committee" from committee names
    comm_name = (
        comm_name.replace("Comisión de ", "")
        .replace("Comisión sobre ", "")
        .replace("Comisión para ", "")
        .replace("Comisión Especial para el Estudio de ", "")
        .replace("Comisión Especial para ", "")
        .replace("Comisión ", ""))
    # Drop a leading Spanish article (la/las/el/los), then re-capitalize.
    comm_name = re.sub(r"(?u)^(las?|el|los)\s", "", comm_name)
    comm_name = comm_name[0].upper() + comm_name[1:]

    comm = Organization(comm_name, chamber="upper", classification="committee")
    comm.add_source(url)

    members = self.get_nodes(inner_content, ".//li")
    for member in members:
        # List items look like "Hon. Name - Title"; the title is optional.
        name_parts = member.text.split("-")
        name = name_parts[0].replace("Hon. ", "").strip()

        if len(name_parts) > 1:
            title = name_parts[1].strip()

            # Translate titles to English for parity with other states
            if "President" in title:
                title = "chairman"
            elif title.startswith("Vicepresident"):
                title = "vicechairman"
            elif title.startswith("Secretari"):
                title = "secretary"
            else:
                raise AssertionError(
                    "Unknown member type: {}".format(title))

            comm.add_member(name, title)
        else:
            comm.add_member(name)

    yield comm
def scrape_house_committees(self):
    """Yield Michigan House committees from the committee drop-down page.

    Each <option> in the drop-down identifies a committee; its value is
    the key for the committee's detail page.
    """
    base_url = "http://house.mi.gov/MHRPublic/CommitteeInfo.aspx?comkey="
    html = self.get("http://house.mi.gov/mhrpublic/committee.aspx").text
    doc = lxml.html.fromstring(html)

    # get values out of drop down
    for opt in doc.xpath("//option"):
        name = opt.text
        # skip invalid choice
        if opt.text in ("Statutory Committees", "Select One"):
            continue
        if "have not been created" in opt.text:
            self.warning("no committees yet for the house")
            return
        com_url = base_url + opt.get("value")
        com_html = self.get(com_url).text
        cdoc = lxml.html.fromstring(com_html)
        com = Organization(chamber="lower", name=name, classification="committee")
        com.add_source(com_url)

        # BUG FIX: removed a dead loop that iterated "memberLink" anchors
        # of the drop-down page (`doc`, not the committee page `cdoc`) and
        # only clobbered ``name`` without ever using the result.

        # All links to http:// pages in servicecolumn2 are legislators.
        members = cdoc.xpath('//div[contains(@id,"memberPanelRow")]')
        for mem in members:
            member = mem.xpath("./a")
            if member:
                member_name = member[0].text.strip()
            else:
                # this is a blank row
                continue
            text = mem.xpath("./span")[0].text
            if "Committee Chair" in text:
                role = "chair"
            elif "Vice-Chair" in text:
                role = "vice chair"
            else:
                role = "member"
            com.add_member(member_name, role=role)

        yield com
def scrape(self, session=None):
    """Yield NJ committees from the legislature's Access database dumps.

    :param session: session name whose first four characters are the
        starting year; defaults to the latest configured session.
    """
    if not session:
        session = self.jurisdiction.legislative_sessions[-1]["name"]
        self.info("no session specified, using %s", session)

    # Session names begin with the starting calendar year, e.g. "2018...".
    year_abr = session[0:4]

    self._init_mdb(year_abr)
    members_csv = self.access_to_csv("COMember")
    info_csv = self.access_to_csv("Committee")

    org_dictionary = {}

    # Committee Info Database
    for rec in info_csv:
        abrv = rec["Code"]
        comm_name = rec["Description"]

        # Committee codes start with A (Assembly) or S (Senate).
        if abrv[0] == "A":
            chamber = "lower"
        elif abrv[0] == "S":
            chamber = "upper"

        org = Organization(
            name=comm_name, chamber=chamber, classification="committee"
        )
        org.add_source("http://www.njleg.state.nj.us/downloads.asp")
        org_dictionary[abrv] = org

    # Committee Member Database
    POSITIONS = {"C": "chair", "V": "vice-chair", "": "member"}
    for member_rec in members_csv:
        # assignment=P means they are active, assignment=R means removed
        if member_rec["Assignment_to_Committee"] == "P":
            abr = member_rec["Code"]
            org = org_dictionary[abr]

            leg = member_rec["Member"]
            role = POSITIONS[member_rec["Position_on_Committee"]]
            # Convert "Last, First" to "First Last".
            leg = " ".join(leg.split(", ")[::-1])
            org.add_member(leg, role=role)

    for org in org_dictionary.values():
        yield org
def scrape_senate_committee(self, url):
    """Yield one Senate committee (or subcommittee) from its bio page.

    :raises ValueError: if a member's position label is unrecognized
    """
    html = self.get(url).text
    doc = lxml.html.fromstring(html)

    com_name = doc.xpath('//a[contains(@href, "committee_bio")]/text()')[0]
    # A committee_bio link inside an <h4> indicates a parent committee.
    parent = doc.xpath('//h4//a[contains(@href, "committee_bio")]/text()')
    if parent:
        self.log("%s is subcommittee of %s", com_name, parent[0])
        com = Organization(
            com_name,
            chamber="upper",
            classification="committee",
            # Pseudo-id: resolved to the parent committee by the importer.
            parent_id={
                "name": parent[0],
                "classification": "upper"
            },
        )
    else:
        com = Organization(com_name, chamber="upper", classification="committee")

    for link in doc.xpath(
            '//div[@id="members"]//a[contains(@href, "member_bio")]'):
        name = link.text_content().strip()
        if name:
            # Position label (e.g. "Chair:") precedes the link in a <b> tag.
            position = link.xpath(".//preceding-sibling::b/text()")
            if not position:
                position = "member"
            elif position[0] == "Chair:":
                position = "chair"
            elif position[0] == "Vice Chair:":
                position = "vice chair"
            elif position[0] == "Ranking Minority Member:":
                position = "ranking minority member"
            else:
                raise ValueError("unknown position: %s" % position[0])

            # Drop a trailing parenthesized district, e.g. " (32)".
            name = name.split(" (")[0]
            com.add_member(name.strip(), position)

    com.add_source(url)
    yield com
def scrape_current(self, chamber):
    """Yield current committees for *chamber* from the Kansas ksapi feed.

    Special (joint) committees are fetched alongside Senate committees and
    classified under the full legislature.
    """
    if chamber == "upper":
        chambers = ["special_committees", "senate_committees"]
    else:
        chambers = ["house_committees"]

    committee_request = self.get(ksapi.url + "ctte/").text
    committee_json = json.loads(committee_request)

    for com_type in chambers:
        committees = committee_json["content"][com_type]

        for committee_data in committees:
            # set to joint if we are using the special_committees
            com_chamber = (
                "legislature" if com_type == "special_committees" else chamber)

            committee = Organization(
                committee_data["TITLE"],
                chamber=com_chamber,
                classification="committee",
            )

            com_url = ksapi.url + "ctte/%s/" % committee_data["KPID"]
            try:
                detail_json = self.get(com_url).text
            except scrapelib.HTTPError:
                self.warning("error fetching committee %s" % com_url)
                continue
            details = json.loads(detail_json)["content"]

            for chair in details["CHAIR"]:
                if chair.get("FULLNAME", None):
                    chair_name = chair["FULLNAME"]
                else:
                    # Fall back to deriving a name from the KPID.
                    chair_name = self.parse_kpid(chair["KPID"])
                    self.warning("no FULLNAME for %s", chair["KPID"])
                committee.add_member(chair_name, "chairman")
            for vicechair in details["VICECHAIR"]:
                committee.add_member(vicechair["FULLNAME"], "vice-chairman")
            for rankedmember in details["RMMEM"]:
                committee.add_member(rankedmember["FULLNAME"], "ranking member")
            for member in details["MEMBERS"]:
                committee.add_member(member["FULLNAME"])

            if not committee._related:
                self.warning(
                    "skipping blank committee %s" % committee_data["TITLE"])
            else:
                committee.add_source(com_url)
                yield committee
def scrape_lower_committee(self, link, name):
    """Scrape one House committee roster page.

    :param link: anchor element whose href points at the committee page
    :param name: committee name as shown on the listing page
    :returns: the populated Organization
    """
    # The href sometimes contains stray whitespace; strip it all out.
    url = re.sub(r"\s+", "", link.attrib["href"])
    doc = lxml.html.fromstring(self.get(url).text)
    doc.make_links_absolute(url)

    committee = Organization(name=name, chamber="lower", classification="committee")
    committee.add_source(url)

    # Member links all carry a "?member=" query parameter; the role label
    # (when present) is the text of the element following each link.
    for member_link in doc.xpath('//a[contains(@href, "?member=")]'):
        member_name = member_link.text_content().strip()
        member_name = re.sub(r"^Delegate\s+", "", member_name)
        member_role = member_link.getnext().text or "member"
        committee.add_member(member_name, member_role.strip())

    return committee
def scrape_lower_committee(self, name, parent, url):
    """Yield one House committee (or subcommittee) from its district page.

    :param name: committee (or subcommittee) name
    :param parent: parent committee name, or falsy for a top-level committee
    :param url: committee page URL
    :raises Exception: if the page yields no members (and is not test data)
    """
    page = self.curl_lxmlize(url)

    if "Joint" in name or (parent and "Joint" in parent):
        chamber = "joint"
    else:
        chamber = "lower"

    subcomm = None
    if parent:
        comm = Organization(name=parent, chamber=chamber, classification="committee")
        subcomm = Organization(name=name, parent_id=comm, classification="committee")
    else:
        comm = Organization(name=name, chamber=chamber, classification="committee")
    comm.add_source(url)

    # Member links point at district pages.
    xpath = "//a[contains(@href, 'District')]"
    for link in page.xpath(xpath):
        member = link.xpath("string()").strip()
        member = re.sub(r"\s+", " ", member)

        if not member or member == "House District Maps":
            continue

        # Optional leading role ("Chair", "Vice Chair", "Co-Chair", ...)
        # followed by "Rep. Name".
        match = re.match(r"((Co-)?(Vice )?Chair)?Rep\. ([^\(]+)", member)
        member = match.group(4).strip()
        role = match.group(1) or "member"

        member = member.replace("Representative ", "")

        comm.add_member(member, role.lower())

    if not comm._related:
        # BUG FIX: the original referenced ``subcomm`` unconditionally in
        # this branch, raising NameError whenever a parentless committee
        # had no members. Guard on subcomm being set.
        if subcomm is not None and subcomm.name == "test":
            # Whoopsie, prod data.
            return
        raise Exception(
            "no members for %s (%s)"
            % (comm.name, subcomm.name if subcomm is not None else name)
        )

    yield comm
def scrape_lower_committee(self, committee_name, url):
    """Yield one lower-chamber committee, but only if it has members.

    :param committee_name: committee name (whitespace is normalized)
    :param url: committee page URL
    """
    page = self.lxmlize(url)

    committee_name = committee_name.strip()
    comm = Organization(committee_name, chamber="lower", classification="committee")
    comm.add_source(url)

    info_node = self.get_node(
        page,
        './/div[@id = "dnn_ctr1109_ViewWebCommission_WebCommission1_'
        'pnlCommission"]',
    )

    # This will likely capture empty text nodes as well.
    members = self.get_nodes(
        info_node,
        './/div[@class="two-cols com"]/div[@class="col"]//text()'
        "[normalize-space() and preceding-sibling::br]",
    )

    member_count = 0

    for member in members:
        # Strip the honorific prefix.
        member = re.sub(r"Hon\.\s*", "", member).strip()

        # Skip empty nodes.
        if not member:
            continue

        # _match_title splits a trailing title off the name (title may
        # be None).
        member, title = self._match_title(member)

        if title is not None:
            comm.add_member(member, title)
        else:
            comm.add_member(member)

        member_count += 1

    # Only emit committees that actually have members.
    if member_count > 0:
        yield comm
def scrape(self, chamber=None):
    """Yield Utah committees from the legislature's JSON data files.

    Chamber is derived from each committee's name prefix ("House ...",
    "Senate ...", otherwise joint), overriding the *chamber* argument.
    """
    committees_url = "http://le.utah.gov/data/committees.json"
    committees = self.get(committees_url).json()["committees"]

    people_url = "http://le.utah.gov/data/legislators.json"
    people = self.get(people_url).json()["legislators"]

    # The committee JSON only has legislator IDs, not names
    ids_to_names = {}
    for person in people:
        ids_to_names[person["id"]] = person["formatName"]

    for committee in committees:
        name = committee["description"]
        # Strip the generic suffix from the committee name.
        if name.endswith(" Committee"):
            name = name[: len(name) - len(" Committee")]
        elif name.endswith(" Subcommittee"):
            name = name[: len(name) - len(" Subcommittee")]

        if name.startswith("House "):
            name = name[len("House ") :]
            chamber = "lower"
        elif name.startswith("Senate "):
            name = name[len("Senate ") :]
            chamber = "upper"
        else:
            chamber = "legislature"

        c = Organization(chamber=chamber, name=name, classification="committee")
        c.add_source(committees_url)
        c.add_source(people_url)
        c.add_link(committee["link"])

        for member in committee["members"]:
            try:
                member_name = ids_to_names[member["id"]]
            except KeyError:
                self.warning(
                    "Found unknown legislator ID in committee JSON: "
                    + member["id"]
                )
                # BUG FIX: without this ``continue`` the add_member call
                # below ran anyway, raising NameError on the first unknown
                # ID (or silently re-adding the previous member's name).
                continue
            c.add_member(member_name, role=member["position"])

        yield c