def get_organizations(self):
    """Yield the Arizona Secretary of State office organization."""
    office = Organization(
        name="Office of the Secretary of State, State of Arizona",
        classification="office",
    )
    # NOTE: the office's voice number (602-542-4285) was deliberately left
    # out of the published contact details; only the address is recorded.
    office.add_contact_detail(
        type="address",
        value="1700 W Washington St Fl 7, Phoenix AZ 85007-2808",
    )
    office.add_link(url="http://www.azsos.gov/", note="Home page")
    # Keep a handle for later use by the scraper, then emit the org.
    self._secretary_of_state = office
    yield office
def test_full_organization():
    """Round-trip a scraped organization through the importer and check fields."""
    org = ScrapeOrganization('United Nations', classification='international')
    org.add_identifier('un')
    org.add_name('UN', start_date='1945')
    org.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    org.add_link('http://example.com/link')
    org.add_source('http://example.com/source')

    # Serialize and import the organization.
    OrganizationImporter('jurisdiction-id').import_data([org.as_dict()])

    # Fetch the stored record and verify every field survived the import.
    o = Organization.objects.get()
    assert 'ocd-organization' in o.id
    assert o.name == org.name

    identifier = o.identifiers.all()[0]
    assert identifier.identifier == 'un'
    assert identifier.scheme == ''

    other_name = o.other_names.all()[0]
    assert other_name.name == 'UN'
    assert other_name.start_date == '1945'

    contact = o.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert o.links.all()[0].url == 'http://example.com/link'
    assert o.sources.all()[0].url == 'http://example.com/source'
def scrape_committees(self, repos):
    """Yield committee and subcommittee Organizations parsed from the
    unitedstates/congress-legislators YAML files named in *repos*.

    Subcommittees are emitted before their parent committee, each linked
    via ``parent_id``.
    """
    for repo in repos:
        source = (
            "https://raw.githubusercontent.com/unitedstates/"
            "congress-legislators/master/{0}".format(repo)
        )
        committees = self.fetch_yaml(source)
        for committee in committees:
            org = Organization(committee["name"], classification="committee")
            org.add_source(source)
            for key in committee.keys() & {"url", "rss_url"}:
                org.add_link(committee[key])
            for key in committee.keys() & {"phone", "address"}:
                # Phone numbers map onto the "voice" contact type.
                contact_type = "voice" if key == "phone" else key
                org.add_contact_detail(type=contact_type, value=committee[key])
            for key in committee.keys() & {"senate_committee_id",
                                           "house_committee_id",
                                           "thomas_id"}:
                org.add_identifier(committee[key], scheme=key)
            for subcommittee in committee.get("subcommittees", []):
                sub_org = Organization(subcommittee["name"],
                                       classification="committee",
                                       parent_id=org._id)
                sub_org.add_identifier(subcommittee["thomas_id"], scheme="thomas")
                sub_org.add_source(source)
                for key in subcommittee.keys() & {"phone", "address"}:
                    # BUG FIX: previously read committee[key]; contact details
                    # must come from the subcommittee's own record (the parent
                    # may not even define the key).
                    contact_type = "voice" if key == "phone" else key
                    sub_org.add_contact_detail(type=contact_type,
                                               value=subcommittee[key])
                yield sub_org
            yield org
def test_full_organization():
    """Import a fully-populated scraped organization and verify each field."""
    create_jurisdictions()

    org = ScrapeOrganization('United Nations', classification='international')
    org.add_identifier('un')
    org.add_name('UN', start_date='1945')
    org.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    org.add_link('http://example.com/link')
    org.add_source('http://example.com/source')

    # Run the serialized org through the importer.
    OrganizationImporter('jid1').import_data([org.as_dict()])

    # Pull the single stored record back out and assert on every attribute.
    o = Organization.objects.get()
    assert 'ocd-organization' in o.id
    assert o.name == org.name

    ident = o.identifiers.all()[0]
    assert ident.identifier == 'un'
    assert ident.scheme == ''

    alias = o.other_names.all()[0]
    assert alias.name == 'UN'
    assert alias.start_date == '1945'

    phone = o.contact_details.all()[0]
    assert phone.type == 'phone'
    assert phone.value == '555-555-1234'
    assert phone.note == 'this is fake'

    assert o.links.all()[0].url == 'http://example.com/link'
    assert o.sources.all()[0].url == 'http://example.com/source'
def scrape(self):
    """Yield non-empty committee Organizations from the DC Council site."""
    com_url = "http://dccouncil.us/committees"
    listing = lxml.html.fromstring(self.get(com_url).text)
    listing.make_links_absolute(com_url)

    # De-duplicate anchors pointing at committee detail pages.
    anchors = set(listing.xpath('//a[contains(@href, "dccouncil.us/committees/")]'))
    for anchor in anchors:
        url = anchor.attrib["href"]
        name = anchor.text_content().strip()

        comm_page = lxml.html.fromstring(self.get(url).text)
        comm_page.make_links_absolute(url)

        # classify these as belonging to the legislature
        committee = Organization(
            name=name, classification="committee", chamber="legislature"
        )

        summaries = comm_page.xpath('//p[@class="page-summary"]')
        if summaries:
            committee.extras["summary"] = summaries[0].text_content().strip()

        chair = comm_page.xpath("//h4[text()='Chairperson']/following-sibling::p")
        chair_name = self.remove_title(chair[0].text_content().strip())
        committee.add_member(chair_name, role="chair")

        roster = comm_page.xpath(
            "//h4[text()='Councilmembers']/following-sibling::ul"
        )
        for entry in roster[0].xpath("./li"):
            member_name = self.remove_title(entry.text_content().strip())
            # The chair appears in the member list too; avoid a duplicate.
            if member_name != chair_name:
                committee.add_member(member_name)

        committee.add_source(url)
        committee.add_link(url, note="Official Website")

        if not committee._related:
            self.warning("empty committee: %s;", name)
        else:
            yield committee
def _scrape_committee(self, committee_name, link, chamber):
    """Scrape individual committee page and add members"""
    page = self.get(link).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(link)

    # A subcommittee page carries a "Committee" nav link back to its parent.
    is_subcommittee = bool(page.xpath('//li/a[text()="Committee"]'))

    if is_subcommittee:
        # All TN subcommittees are just the name of the parent committee with " Subcommittee"
        # at the end
        parent_committee_name = re.sub(r'\s*(Study )?Subcommittee\s*', '', committee_name)
        com = Organization(
            committee_name,
            classification='committee',
            # Parent's scraper-side id, recorded when the parent was scraped
            # (parents must therefore be scraped before their subcommittees).
            parent_id=self.parents[parent_committee_name]
        )
    else:
        com = Organization(
            committee_name,
            chamber=chamber,
            classification='committee',
        )
        # Remember this committee's id so its subcommittees can reference it.
        self.parents[committee_name] = com._id

    OFFICER_SEARCH = '//h2[contains(text(), "Committee Officers")]/' \
        'following-sibling::div/ul/li/a'
    MEMBER_SEARCH = '//h2[contains(text(), "Committee Members")]/' \
        'following-sibling::div/ul/li/a'

    for a in (page.xpath(OFFICER_SEARCH) + page.xpath(MEMBER_SEARCH)):
        # A member's name can be split between the anchor's own text and a
        # nested <span>; join the non-empty pieces.
        member_name = ' '.join([
            x.strip() for x in
            a.xpath('text()') + a.xpath('span/text()')
            if x.strip()
        ])
        role = a.xpath('small')
        if role:
            # Officers carry their role (e.g. "Chair") in a <small> element.
            role = role[0].xpath('text()')[0].strip()
        else:
            role = 'member'
        # Skip vacant seats entirely.
        if '(Vacant)' in role:
            continue
        com.add_member(member_name, role)

    com.add_link(link)
    com.add_source(link)

    return com
def _scrape_committee(self, committee_name, link, chamber):
    """Scrape individual committee page and add members"""
    page = self.get(link).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(link)

    # A subcommittee page carries a "Committee" nav link back to its parent.
    is_subcommittee = bool(page.xpath('//li/a[text()="Committee"]'))

    if is_subcommittee:
        # All TN subcommittees are just the name of the parent committee with
        # " Subcommittee" at the end.
        # FIX: some are named "<Parent> Study Subcommittee"; strip the
        # optional "Study " prefix too so the parent name resolves.
        parent_committee_name = re.sub(
            r'\s*(Study )?Subcommittee\s*', '', committee_name)
        com = Organization(
            committee_name,
            classification='committee',
            # Pseudo-id: the importer resolves the parent by these fields.
            parent_id={
                'name': parent_committee_name,
                'classification': chamber
            },
        )
    else:
        com = Organization(
            committee_name,
            chamber=chamber,
            classification='committee',
        )

    OFFICER_SEARCH = '//h2[contains(text(), "Committee Officers")]/' \
        'following-sibling::div/ul/li/a'
    MEMBER_SEARCH = '//h2[contains(text(), "Committee Members")]/' \
        'following-sibling::div/ul/li/a'

    for a in (page.xpath(OFFICER_SEARCH) + page.xpath(MEMBER_SEARCH)):
        # A name may be split between the anchor text and a nested <span>.
        member_name = ' '.join([
            x.strip() for x in
            a.xpath('text()') + a.xpath('span/text()')
            if x.strip()
        ])
        role = a.xpath('small')
        if role:
            # Officers carry their role (e.g. "Chair") in a <small> element.
            role = role[0].xpath('text()')[0].strip()
        else:
            role = 'member'
        # FIX: do not record vacant seats as members.
        if '(Vacant)' in role:
            continue
        com.add_member(member_name, role)

    com.add_link(link)
    com.add_source(link)

    return com
def scrape(self):
    """Scrape DC Council committees and yield those that have members."""
    com_url = 'http://dccouncil.us/committees'
    root = lxml.html.fromstring(self.get(com_url).text)
    root.make_links_absolute(com_url)

    # Use a set so each committee page is visited once.
    for link in set(root.xpath('//a[contains(@href, "dccouncil.us/committees/")]')):
        url = link.attrib['href']
        name = link.text_content().strip()

        page = lxml.html.fromstring(self.get(url).text)
        page.make_links_absolute(url)

        # classify these as belonging to the legislature
        committee = Organization(name=name,
                                 classification='committee',
                                 chamber='legislature')

        summary_nodes = page.xpath('//p[@class="page-summary"]')
        if summary_nodes:
            committee.extras['summary'] = summary_nodes[0].text_content().strip()

        chair_node = page.xpath("//h4[text()='Chairperson']/following-sibling::p")
        chair_name = self.remove_title(chair_node[0].text_content().strip())
        committee.add_member(chair_name, role="chair")

        roster = page.xpath("//h4[text()='Councilmembers']/following-sibling::ul")
        for entry in roster[0].xpath("./li"):
            member = self.remove_title(entry.text_content().strip())
            # The chair is already recorded; don't add a duplicate entry.
            if member != chair_name:
                committee.add_member(member)

        committee.add_source(url)
        committee.add_link(url, note='Official Website')

        if not committee._related:
            self.warning('empty committee: %s;', name)
        else:
            yield committee
def get_organizations(self):
    """Yield the California Secretary of State office organization."""
    sos = Organization(
        name="Office of the Secretary of State, State of California",
        classification="office",
    )
    sos.add_contact_detail(type="voice", value="916-653-6814")
    sos.add_contact_detail(
        type="address",
        value="1500 11th Street, Sacramento, CA 95814",
    )
    sos.add_link(url="http://www.sos.ca.gov", note="Home page")
    # Stash the org for later reference by the scraper before yielding it.
    self._secretary_of_state = sos
    yield sos
def _scrape_committee(self, committee_name, link, chamber):
    """Scrape individual committee page and add members"""
    page = self.get(link).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(link)

    # Subcommittee pages carry a "Committee" nav link back to the parent.
    is_subcommittee = bool(page.xpath('//li/a[text()="Committee"]'))

    if is_subcommittee:
        # All TN subcommittees are just the name of the parent committee with " Subcommittee"
        # at the end
        parent_committee_name = re.sub(r"\s*(Study )?Subcommittee\s*", "", committee_name)
        com = Organization(
            committee_name,
            classification="committee",
            # Parent's id, recorded below when the parent itself was scraped;
            # parents must be scraped before their subcommittees.
            parent_id=self.parents[parent_committee_name],
        )
    else:
        com = Organization(committee_name, chamber=chamber, classification="committee")
        # Remember this committee's id so subcommittees can reference it.
        self.parents[committee_name] = com._id

    OFFICER_SEARCH = ('//h2[contains(text(), "Committee Officers")]/'
                      "following-sibling::div/ul/li/a")
    MEMBER_SEARCH = ('//h2[contains(text(), "Committee Members")]/'
                     "following-sibling::div/ul/li/a")

    for a in page.xpath(OFFICER_SEARCH) + page.xpath(MEMBER_SEARCH):
        # A name may be split between the anchor text and a nested <span>.
        member_name = " ".join([
            x.strip() for x in a.xpath("text()") + a.xpath("span/text()") if x.strip()
        ])
        role = a.xpath("small")
        if role:
            # Officers carry their role (e.g. "Chair") in a <small> element.
            role = role[0].xpath("text()")[0].strip()
        else:
            role = "member"
        # Skip vacant seats entirely.
        if "(Vacant)" in role:
            continue
        com.add_member(member_name, role)

    com.add_link(link)
    com.add_source(link)

    return com
def scrape(self, chamber=None):
    """Yield committee Organizations from Utah's legislature JSON feeds."""
    committees_url = 'http://le.utah.gov/data/committees.json'
    committees = self.get(committees_url).json()['committees']

    people_url = 'http://le.utah.gov/data/legislators.json'
    people = self.get(people_url).json()['legislators']

    # The committee JSON only has legislator IDs, not names
    ids_to_names = {}
    for person in people:
        ids_to_names[person['id']] = person['formatName']

    for committee in committees:
        name = committee['description']
        # Strip the trailing committee-type word from the display name.
        if name.endswith(' Committee'):
            name = name[:len(name) - len(' Committee')]
        elif name.endswith(' Subcommittee'):
            name = name[:len(name) - len(' Subcommittee')]

        # Derive the chamber from the name prefix; default to the whole
        # legislature for joint bodies.
        if name.startswith('House '):
            name = name[len('House '):]
            chamber = 'lower'
        elif name.startswith('Senate '):
            name = name[len('Senate '):]
            chamber = 'upper'
        else:
            chamber = 'legislature'

        c = Organization(
            chamber=chamber,
            name=name,
            classification='committee'
        )
        c.add_source(committees_url)
        c.add_source(people_url)
        c.add_link(committee['link'])

        for member in committee['members']:
            try:
                member_name = ids_to_names[member['id']]
            except KeyError:
                self.warning(
                    "Found unknown legislator ID in committee JSON: " +
                    member['id']
                )
                # BUG FIX: skip the unknown member. Previously execution fell
                # through and added a stale member_name from the prior loop
                # iteration (or raised NameError on the first one).
                continue
            c.add_member(member_name, role=member['position'])

        yield c
def scrape_committees(self, repos):
    """Yield committee and subcommittee Organizations parsed from the
    unitedstates/congress-legislators YAML files named in *repos*.

    Subcommittees are yielded before their parent, linked via ``parent_id``.
    """
    for repo in repos:
        source = (
            'https://raw.githubusercontent.com/unitedstates/'
            'congress-legislators/master/{0}'.format(repo))
        committees = self.fetch_yaml(source)
        for committee in committees:
            org = Organization(committee['name'], classification='committee')
            org.add_source(source)
            for key in committee.keys() & {'url', 'rss_url'}:
                org.add_link(committee[key])
            for key in committee.keys() & {'phone', 'address'}:
                # Phone numbers map onto the "voice" contact type.
                org.add_contact_detail(
                    type='voice' if key == 'phone' else key,
                    value=committee[key])
            for key in committee.keys() & {
                'senate_committee_id', 'house_committee_id', 'thomas_id'
            }:
                org.add_identifier(committee[key], scheme=key)
            for subcommittee in committee.get('subcommittees', []):
                sub_org = Organization(subcommittee['name'],
                                       classification="committee",
                                       parent_id=org._id)
                sub_org.add_identifier(subcommittee['thomas_id'],
                                       scheme="thomas")
                sub_org.add_source(source)
                for key in subcommittee.keys() & {'phone', 'address'}:
                    # BUG FIX: previously read committee[key]; contact details
                    # must come from the subcommittee's own record (the parent
                    # may not even define the key).
                    sub_org.add_contact_detail(
                        type='voice' if key == 'phone' else key,
                        value=subcommittee[key])
                yield sub_org
            yield org
def scrape(self, chamber=None):
    """Yield committee Organizations from Utah's legislature JSON feeds."""
    committees_url = 'http://le.utah.gov/data/committees.json'
    committees = self.get(committees_url).json()['committees']

    people_url = 'http://le.utah.gov/data/legislators.json'
    people = self.get(people_url).json()['legislators']

    # The committee JSON only has legislator IDs, not names
    ids_to_names = {}
    for person in people:
        ids_to_names[person['id']] = person['formatName']

    for committee in committees:
        name = committee['description']
        # Drop the trailing committee-type word from the display name.
        if name.endswith(' Committee'):
            name = name[:len(name) - len(' Committee')]
        elif name.endswith(' Subcommittee'):
            name = name[:len(name) - len(' Subcommittee')]

        # The chamber is encoded as a name prefix; joint bodies get
        # 'legislature'.
        if name.startswith('House '):
            name = name[len('House '):]
            chamber = 'lower'
        elif name.startswith('Senate '):
            name = name[len('Senate '):]
            chamber = 'upper'
        else:
            chamber = 'legislature'

        c = Organization(chamber=chamber, name=name,
                         classification='committee')
        c.add_source(committees_url)
        c.add_source(people_url)
        c.add_link(committee['link'])

        for member in committee['members']:
            try:
                member_name = ids_to_names[member['id']]
            except KeyError:
                self.warning(
                    "Found unknown legislator ID in committee JSON: " +
                    member['id'])
                # BUG FIX: skip unknown IDs. Without this, the next statement
                # used a stale member_name from the previous iteration or
                # raised NameError on the first iteration.
                continue
            c.add_member(member_name, role=member['position'])

        yield c
def scrape(self, chamber=None):
    """Yield committee Organizations from Utah's legislature JSON feeds."""
    committees_url = "http://le.utah.gov/data/committees.json"
    committees = self.get(committees_url).json()["committees"]

    people_url = "http://le.utah.gov/data/legislators.json"
    people = self.get(people_url).json()["legislators"]

    # The committee JSON only has legislator IDs, not names
    ids_to_names = {}
    for person in people:
        ids_to_names[person["id"]] = person["formatName"]

    for committee in committees:
        name = committee["description"]
        # Strip the trailing committee-type word from the display name.
        if name.endswith(" Committee"):
            name = name[: len(name) - len(" Committee")]
        elif name.endswith(" Subcommittee"):
            name = name[: len(name) - len(" Subcommittee")]

        # The chamber is encoded as a name prefix; joint bodies fall back to
        # "legislature".
        if name.startswith("House "):
            name = name[len("House ") :]
            chamber = "lower"
        elif name.startswith("Senate "):
            name = name[len("Senate ") :]
            chamber = "upper"
        else:
            chamber = "legislature"

        c = Organization(chamber=chamber, name=name, classification="committee")
        c.add_source(committees_url)
        c.add_source(people_url)
        c.add_link(committee["link"])

        for member in committee["members"]:
            try:
                member_name = ids_to_names[member["id"]]
            except KeyError:
                self.warning(
                    "Found unknown legislator ID in committee JSON: " + member["id"]
                )
                # BUG FIX: skip unknown IDs. Without this, the next statement
                # used a stale member_name from the previous iteration or
                # raised NameError on the first iteration.
                continue
            c.add_member(member_name, role=member["position"])

        yield c
def get_organizations(self):
    """Yield the Virginia Secretary of the Commonwealth office."""
    office = Organization(
        name="Office of the Secretary of the Commonwealth, Commonwealth of Virginia",
        classification="office",
    )
    office.add_contact_detail(type="voice", value="804-786-2441")
    office.add_contact_detail(
        type="address",
        value="1111 East Broad Street, 4th Floor, Richmond, Virginia 23219",
    )
    office.add_link(url="https://commonwealth.virginia.gov/", note="Home page")
    # Keep a reference for the scraper, then emit the organization.
    self._secretary_of_the_commonwealth = office
    yield office
def get_organizations(self):
    """Yield the California Secretary of State office organization."""
    office = Organization(
        name="Office of the Secretary of State, State of California",
        classification="office",
    )
    # Contact details recorded as (type, value) pairs.
    for detail_type, detail_value in (
        ("voice", "916-653-6814"),
        ("address", "1500 11th Street, Sacramento, CA 95814"),
    ):
        office.add_contact_detail(type=detail_type, value=detail_value)
    office.add_link(url="http://www.sos.ca.gov", note="Home page")
    self._secretary_of_state = office
    yield office
def scrape_joint_committee(self, committee_name, url):
    """Scrape a joint committee; layout is dispatched on the URL."""
    if "state.tn.us" in url:
        # Legacy state.tn.us layout: members listed in the first column of a
        # roster table inside the "Blurb" div.
        com = Organization(committee_name, chamber="legislature", classification="committee")
        try:
            page = self.get(url).text
        except requests.exceptions.ConnectionError:
            # Best-effort: some listed committee links are dead.
            self.logger.warning("Committee link is broken, skipping")
            return
        page = lxml.html.fromstring(page)

        # Rows 2 through 9 of the table hold the member cells.
        for el in page.xpath(
            "//div[@class='Blurb']/table//tr[2 <= position() and position() < 10]/td[1]"
        ):
            if el.xpath("text()") == ["Vacant"]:
                continue
            (member_name, ) = el.xpath("a/text()")
            if el.xpath("text()"):
                # Bare text after the link (e.g. ", Chair") is the role.
                role = el.xpath("text()")[0].strip(" ,")
            else:
                role = "member"
            # Drop honorific title prefixes from the name.
            member_name = member_name.replace("Senator", "")
            member_name = member_name.replace("Representative", "")
            member_name = member_name.strip()
            com.add_member(member_name, role)
        com.add_link(url)
        com.add_source(url)

        return com
    elif "gov-opps" in url:
        # Government Operations: membership split across the senate and house
        # versions of the page; merge both into one organization.
        com = Organization(committee_name, chamber="legislature", classification="committee")
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        links = ["senate", "house"]
        for link in links:
            chamber_link = self.base_href + "/" + link + "/committees/gov-opps.html"
            chamber_page = self.get(chamber_link).text
            chamber_page = lxml.html.fromstring(chamber_page)

            OFFICER_SEARCH = (
                '//h2[contains(text(), "Committee Officers")]/'
                "following-sibling::div/ul/li/a")
            MEMBER_SEARCH = ('//h2[contains(text(), "Committee Members")]/'
                             "following-sibling::div/ul/li/a")

            for a in chamber_page.xpath(
                    OFFICER_SEARCH) + chamber_page.xpath(MEMBER_SEARCH):
                member_name = " ".join(
                    [x.strip() for x in a.xpath(".//text()") if x.strip()])
                role = a.xpath("small")
                if role:
                    # Officer anchors embed the role in a <small> tag, which
                    # also shows up in the joined text; strip it back out.
                    role = role[0].xpath("text()")[0].strip()
                    member_name = member_name.replace(role, "").strip()
                else:
                    role = "member"
                com.add_member(member_name, role)

            com.add_source(chamber_link)

        com.add_link(url)
        com.add_source(url)

        return com
    else:
        # Everything else uses the standard committee page layout.
        return self._scrape_committee(committee_name, url, "legislature")
def scrape_joint_committee(self, committee_name, url):
    """Scrape a joint committee; layout is dispatched on the URL."""
    if 'state.tn.us' in url:
        # Legacy state.tn.us layout: members listed in the first column of a
        # roster table inside the "Blurb" div.
        com = Organization(
            committee_name,
            chamber='legislature',
            classification='committee',
        )
        try:
            page = self.get(url).text
        except requests.exceptions.ConnectionError:
            # Best-effort: some listed committee links are dead.
            self.logger.warning("Committee link is broken, skipping")
            return
        page = lxml.html.fromstring(page)

        # Rows 2 through 9 of the table hold the member cells.
        for el in page.xpath(
            "//div[@class='Blurb']/table//tr[2 <= position() and position() < 10]/td[1]"
        ):
            if el.xpath('text()') == ['Vacant']:
                continue
            (member_name, ) = el.xpath('a/text()')
            if el.xpath('text()'):
                # Bare text after the link (e.g. ", Chair") is the role.
                role = el.xpath('text()')[0].strip(' ,')
            else:
                role = 'member'
            # Drop honorific title prefixes from the name.
            member_name = member_name.replace('Senator', '')
            member_name = member_name.replace('Representative', '')
            member_name = member_name.strip()
            com.add_member(member_name, role)
        com.add_link(url)
        com.add_source(url)

        return com
    elif 'gov-opps' in url:
        # Government Operations: membership split across the senate and house
        # versions of the page; merge both into one organization.
        com = Organization(
            committee_name,
            chamber='legislature',
            classification='committee',
        )
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        links = ['senate', 'house']
        for link in links:
            chamber_link = self.base_href + '/' + link + '/committees/gov-opps.html'
            chamber_page = self.get(chamber_link).text
            chamber_page = lxml.html.fromstring(chamber_page)

            OFFICER_SEARCH = '//h2[contains(text(), "Committee Officers")]/' \
                'following-sibling::div/ul/li/a'
            MEMBER_SEARCH = '//h2[contains(text(), "Committee Members")]/' \
                'following-sibling::div/ul/li/a'

            for a in (chamber_page.xpath(OFFICER_SEARCH) +
                      chamber_page.xpath(MEMBER_SEARCH)):
                member_name = ' '.join(
                    [x.strip() for x in a.xpath('.//text()') if x.strip()])
                role = a.xpath('small')
                if role:
                    # Officer anchors embed the role in a <small> tag, which
                    # also shows up in the joined text; strip it back out.
                    role = role[0].xpath('text()')[0].strip()
                    member_name = member_name.replace(role, '').strip()
                else:
                    role = 'member'
                com.add_member(member_name, role)

            com.add_source(chamber_link)

        com.add_link(url)
        com.add_source(url)

        return com
    else:
        # Everything else uses the standard committee page layout.
        return self._scrape_committee(committee_name, url, 'legislature')
def get_organizations(self):
    """Yield the US Congress organization tree: the legislature, both
    chambers, and the two disclosure-handling offices."""
    legislature = Organization("United States Congress",
                               classification='legislature')
    self._legislature = legislature
    yield legislature

    senate = Organization(
        name="United States Senate",
        classification='upper',
        parent_id=legislature._id,
    )
    self._senate = senate
    yield senate

    house = Organization(
        name="United States House",
        classification='lower',
        parent_id=legislature._id,
    )
    self._house = house
    yield house

    # Senate Office of Public Record: receives lobbying disclosure filings.
    sopr = Organization(
        name="Office of Public Record, US Senate",
        classification="office",
        parent_id=senate._id,
    )
    sopr.add_contact_detail(type="voice", value="202-224-0322")
    sopr.add_source(url="http://www.senate.gov/pagelayout/legislative/"
                        "one_item_and_teasers/opr.htm",
                    note="Profile page")
    sopr.add_source(url="http://www.senate.gov/pagelayout/legislative/"
                        "g_three_sections_with_teasers/lobbyingdisc.htm"
                        "#lobbyingdisc=lda",
                    note="Disclosure Home")
    sopr.add_link(url="http://soprweb.senate.gov/index.cfm"
                      "?event=selectfields",
                  note="Disclosure Search Portal")
    sopr.add_link(url="http://soprweb.senate.gov/",
                  note="Disclosure Electronic Filing System")
    self._sopr = sopr
    yield sopr

    house_clerk = Organization(
        name="Office of the Clerk, US House",
        classification="office",
        parent_id=house._id,
    )
    house_clerk.add_contact_detail(type="voice", value="202-225-7000")
    house_clerk.add_source(url="http://clerk.house.gov/", note="Home page")
    self._house_clerk = house_clerk
    yield house_clerk
    # BUG FIX: removed a trailing duplicate ``yield legislature`` that emitted
    # the legislature organization a second time.
def get_organizations(self):
    """Yield the US Congress organization tree: the legislature, both
    chambers, and the two disclosure-handling offices."""
    legislature = Organization("United States Congress",
                               classification='legislature')
    self._legislature = legislature
    yield legislature

    senate = Organization(
        name="United States Senate",
        classification='upper',
        parent_id=legislature._id,
    )
    self._senate = senate
    yield senate

    house = Organization(
        name="United States House",
        classification='lower',
        parent_id=legislature._id,
    )
    self._house = house
    yield house

    # Senate Office of Public Record: receives lobbying disclosure filings.
    sopr = Organization(
        name="Office of Public Record, US Senate",
        classification="office",
        parent_id=senate._id,
    )
    sopr.add_contact_detail(type="voice", value="202-224-0322")
    sopr.add_source(url="http://www.senate.gov/pagelayout/legislative/"
                        "one_item_and_teasers/opr.htm",
                    note="Profile page")
    sopr.add_source(url="http://www.senate.gov/pagelayout/legislative/"
                        "g_three_sections_with_teasers/lobbyingdisc.htm"
                        "#lobbyingdisc=lda",
                    note="Disclosure Home")
    sopr.add_link(url="http://soprweb.senate.gov/index.cfm"
                      "?event=selectfields",
                  note="Disclosure Search Portal")
    sopr.add_link(url="http://soprweb.senate.gov/",
                  note="Disclosure Electronic Filing System")
    self._sopr = sopr
    yield sopr

    house_clerk = Organization(
        name="Office of the Clerk, US House",
        classification="office",
        parent_id=house._id,
    )
    house_clerk.add_contact_detail(type="voice", value="202-225-7000")
    house_clerk.add_source(url="http://clerk.house.gov/", note="Home page")
    self._house_clerk = house_clerk
    yield house_clerk
    # BUG FIX: removed a trailing duplicate ``yield legislature`` that emitted
    # the legislature organization a second time.
def scrape_joint_committee(self, committee_name, url):
    """Scrape a joint committee; layout is dispatched on the URL."""
    if 'state.tn.us' in url:
        # Legacy state.tn.us layout: members listed in the first column of a
        # roster table inside the "Blurb" div.
        com = Organization(
            committee_name,
            chamber='joint',
            classification='committee',
        )
        try:
            page = self.get(url).text
        except requests.exceptions.ConnectionError:
            # Best-effort: some listed committee links are dead.
            self.logger.warning("Committee link is broken, skipping")
            return
        page = lxml.html.fromstring(page)

        # Rows 2 through 9 of the table hold the member cells.
        for el in page.xpath(
            "//div[@class='Blurb']/table//tr[2 <= position() and position() < 10]/td[1]"
        ):
            if el.xpath('text()') == ['Vacant']:
                continue
            (member_name, ) = el.xpath('a/text()')
            if el.xpath('text()'):
                # Bare text after the link (e.g. ", Chair") is the role.
                role = el.xpath('text()')[0].strip(' ,')
            else:
                role = 'member'
            # Drop honorific title prefixes from the name.
            member_name = member_name.replace('Senator', '')
            member_name = member_name.replace('Representative', '')
            member_name = member_name.strip()
            com.add_member(member_name, role)
        com.add_link(url)
        com.add_source(url)

        return com
    elif 'gov-opps' in url:
        # Government Operations: membership split across the senate and house
        # versions of the page; merge both into one organization.
        com = Organization(
            committee_name,
            chamber='joint',
            classification='committee',
        )
        page = self.get(url).text
        page = lxml.html.fromstring(page)

        links = ['senate', 'house']
        for link in links:
            chamber_link = self.base_href + '/' + link + '/committees/gov-opps.html'
            chamber_page = self.get(chamber_link).text
            chamber_page = lxml.html.fromstring(chamber_page)

            OFFICER_SEARCH = '//h2[contains(text(), "Committee Officers")]/' \
                'following-sibling::div/ul/li/a'
            MEMBER_SEARCH = '//h2[contains(text(), "Committee Members")]/' \
                'following-sibling::div/ul/li/a'

            for a in (
                chamber_page.xpath(OFFICER_SEARCH) +
                chamber_page.xpath(MEMBER_SEARCH)
            ):
                member_name = ' '.join([
                    x.strip() for x in
                    a.xpath('.//text()')
                    if x.strip()
                ])
                role = a.xpath('small')
                if role:
                    # Officer anchors embed the role in a <small> tag, which
                    # also shows up in the joined text; strip it back out.
                    role = role[0].xpath('text()')[0].strip()
                    member_name = member_name.replace(role, '').strip()
                else:
                    role = 'member'
                com.add_member(member_name, role)

            com.add_source(chamber_link)

        com.add_link(url)
        com.add_source(url)

        return com
    else:
        # Everything else uses the standard committee page layout.
        return self._scrape_committee(committee_name, url, 'joint')