def scrape(self):
    """Yield one committee Organization per two-letter committee code,
    folding data from every legislative session the committee appears in.
    """
    sessions = reversed(self.jurisdiction.legislative_sessions)
    term_instances = committees_from_sessions(self, sessions)
    by_code = build_lookup_dict(self, data_list=term_instances, index_key='code')

    for code, instances in by_code.items():
        # TODO: Figure out how to edit city council org.
        if code == 'CC':
            continue

        extras = {'tmmis_decision_body_ids': []}
        canonical_i = 0
        for i, inst in enumerate(instances):
            # TODO: Ensure this survives addition of new term (2017)
            # so specific year always creates
            if i == canonical_i:
                o = Organization(name=inst['name'], classification='committee')
                extras.update({'description': inst['info']})
                o.add_identifier(inst['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

            # `extras` is shared by reference with o.extras, so entries
            # appended for later terms keep enriching the same object.
            extras['tmmis_decision_body_ids'].append(
                {inst['term']: inst['decision_body_id']})
            o.extras = extras
            o.add_source(inst['source_url'])

            if instances[canonical_i]['name'] != inst['name']:
                # TODO: Add start_date and end_date
                o.add_name(inst['name'])

        yield o
def scrape(self):
    """Yield a committee Organization for each committee code, built by
    merging that committee's per-session instances.
    """
    sessions = reversed(self.jurisdiction.legislative_sessions)
    instances_by_code = build_lookup_dict(
        self,
        data_list=committees_from_sessions(self, sessions),
        index_key='code',
    )

    for code, instances in instances_by_code.items():
        # TODO: Figure out how to edit city council org.
        if code == 'CC':
            continue

        extras = {'tmmis_decision_body_ids': []}
        for index, instance in enumerate(instances):
            # TODO: Ensure this survives addition of new term (2017)
            # so specific year always creates
            canonical_index = 0
            if index == canonical_index:
                organization = Organization(
                    name=instance['name'], classification='committee')
                extras.update({'description': instance['info']})
                organization.add_identifier(
                    instance['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

            # Accumulate this term's decision-body id; the dict is shared
            # with organization.extras, so later terms extend it in place.
            extras['tmmis_decision_body_ids'].append(
                {instance['term']: instance['decision_body_id']})
            organization.extras = extras
            organization.add_source(instance['source_url'])

            if instance['name'] != instances[canonical_index]['name']:
                # TODO: Add start_date and end_date
                organization.add_name(instance['name'])

        yield organization
def scrape_committees(self, repos):
    """Yield committee (and subcommittee) Organizations parsed from the
    unitedstates/congress-legislators YAML files named in *repos*.

    :param repos: iterable of YAML file names within the repository.
    """
    for repo in repos:
        source = ("https://raw.githubusercontent.com/unitedstates/"
                  "congress-legislators/master/{0}".format(repo))
        committees = self.fetch_yaml(source)
        for committee in committees:
            org = Organization(committee["name"], classification="committee")
            org.add_source(source)

            for key in committee.keys() & {"url", "rss_url"}:
                org.add_link(committee[key])

            # A phone number is stored as a "voice" contact detail; other
            # keys (e.g. "address") keep their own type.
            for key in committee.keys() & {"phone", "address"}:
                contact_type = "voice" if key == "phone" else key
                org.add_contact_detail(type=contact_type, value=committee[key])

            for key in committee.keys() & {"senate_committee_id",
                                           "house_committee_id", "thomas_id"}:
                org.add_identifier(committee[key], scheme=key)

            if "subcommittees" in committee:
                for subcommittee in committee["subcommittees"]:
                    sub_org = Organization(subcommittee["name"],
                                           classification="committee",
                                           parent_id=org._id)
                    sub_org.add_identifier(subcommittee["thomas_id"],
                                           scheme="thomas")
                    sub_org.add_source(source)

                    # BUG FIX: read contact details from the subcommittee
                    # record, not from the parent committee's record.
                    for key in subcommittee.keys() & {"phone", "address"}:
                        contact_type = "voice" if key == "phone" else key
                        sub_org.add_contact_detail(type=contact_type,
                                                   value=subcommittee[key])

                    yield sub_org

            yield org
def test_full_organization():
    """Round-trip a fully-populated scraped organization through the importer."""
    create_jurisdictions()

    # Build a scrape-side organization with one of everything attached.
    scraped = ScrapeOrganization('United Nations', classification='international')
    scraped.add_identifier('un')
    scraped.add_name('UN', start_date='1945')
    scraped.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    scraped.add_link('http://example.com/link')
    scraped.add_source('http://example.com/source')

    # import org
    OrganizationImporter('jid1').import_data([scraped.as_dict()])

    # get person from db and assert it imported correctly
    imported = Organization.objects.get()
    assert 'ocd-organization' in imported.id
    assert imported.name == scraped.name

    identifier = imported.identifiers.all()[0]
    assert identifier.identifier == 'un'
    assert identifier.scheme == ''

    other_name = imported.other_names.all()[0]
    assert other_name.name == 'UN'
    assert other_name.start_date == '1945'

    contact = imported.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert imported.links.all()[0].url == 'http://example.com/link'
    assert imported.sources.all()[0].url == 'http://example.com/source'
def test_full_organization():
    """Import a fully-populated organization and verify every field survives."""
    org = ScrapeOrganization('United Nations', classification='international')
    org.add_identifier('un')
    org.add_name('UN', start_date='1945')
    org.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    org.add_link('http://example.com/link')
    org.add_source('http://example.com/source')

    # import org
    OrganizationImporter('jurisdiction-id').import_data([org.as_dict()])

    def first(relation):
        # Convenience: the first related row (each collection has one entry).
        return relation.all()[0]

    # get person from db and assert it imported correctly
    saved = Organization.objects.get()
    assert 'ocd-organization' in saved.id
    assert saved.name == org.name
    assert first(saved.identifiers).identifier == 'un'
    assert first(saved.identifiers).scheme == ''
    assert first(saved.other_names).name == 'UN'
    assert first(saved.other_names).start_date == '1945'
    assert first(saved.contact_details).type == 'phone'
    assert first(saved.contact_details).value == '555-555-1234'
    assert first(saved.contact_details).note == 'this is fake'
    assert first(saved.links).url == 'http://example.com/link'
    assert first(saved.sources).url == 'http://example.com/source'
def scrape(self):
    """Yield committee Organizations, one per committee code, including
    councillor memberships/posts scraped from the canonical term's
    reference meeting.

    NOTE(review): indentation was reconstructed from collapsed source;
    member scraping is assumed to run only for the canonical (first)
    instance — confirm against upstream history.
    """
    sessions = reversed(self.jurisdiction.legislative_sessions)
    committee_term_instances = committees_from_sessions(self, sessions)
    committees_by_code = build_lookup_dict(
        self, data_list=committee_term_instances, index_key='code')
    for code, instances in committees_by_code.items():
        # TODO: Figure out how to edit city council org.
        if code == 'CC':
            continue
        # When there are no meetings scheduled and was no way to deduce committee code.
        if not code:
            continue
        # Collected across all of this committee's term instances.
        extras = {'tmmis_decision_body_ids': []}
        for i, inst in enumerate(instances):
            # TODO: Ensure this survives addition of new term (2017)
            # so specific year always creates
            canonical_i = 0
            if i == canonical_i:
                # The first (most recent) instance defines the canonical
                # name, description, and identifier.
                o = Organization(name=inst['name'], classification='committee')
                extras.update({'description': inst['info']})
                o.add_identifier(inst['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)
                # TODO: Scrape non-councillor members
                meeting_id = self.referenceMeetingId(
                    inst['code'], inst['term'])
                if meeting_id:
                    # Track roles already turned into posts so each role
                    # yields at most one post.
                    seen_posts = []
                    membership_url = MEMBERSHIP_URL_TEMPLATE.format(
                        meeting_id)
                    for councillor in self.councillorMembers(
                            membership_url):
                        o.add_member(councillor['name'], councillor['role'])
                        if councillor['role'] not in seen_posts:
                            # TODO: More specific divisions for some committee?
                            o.add_post(
                                role=councillor['role'],
                                label=councillor['role'],
                                division_id=self.jurisdiction.division_id)
                            seen_posts.append(councillor['role'])
            # `extras` is the same object assigned to o.extras above, so
            # these per-term appends are visible on the organization.
            extras['tmmis_decision_body_ids'].append(
                {inst['term']: inst['decision_body_id']})
            o.extras = extras
            o.add_source(inst['source_url'])
            if instances[canonical_i]['name'] != inst['name']:
                # TODO: Add start_date and end_date
                o.add_name(inst['name'])
        yield o
def scrape(self):
    """Yield committee Organizations (with councillor members and posts)
    aggregated across every legislative session.
    """
    sessions = reversed(self.jurisdiction.legislative_sessions)
    instances_by_code = build_lookup_dict(
        self,
        data_list=committees_from_sessions(self, sessions),
        index_key='code',
    )

    for code, instances in instances_by_code.items():
        # TODO: Figure out how to edit city council org.
        if code == 'CC':
            continue
        # When there are no meetings scheduled and was no way to deduce committee code.
        if not code:
            continue

        extras = {'tmmis_decision_body_ids': []}
        canonical_index = 0
        for index, instance in enumerate(instances):
            # TODO: Ensure this survives addition of new term (2017)
            # so specific year always creates
            if index == canonical_index:
                organization = Organization(
                    name=instance['name'], classification='committee')
                extras.update({'description': instance['info']})
                organization.add_identifier(
                    instance['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

                # TODO: Scrape non-councillor members
                meeting_id = self.referenceMeetingId(
                    instance['code'], instance['term'])
                if meeting_id:
                    posted_roles = []
                    membership_url = MEMBERSHIP_URL_TEMPLATE.format(meeting_id)
                    for councillor in self.councillorMembers(membership_url):
                        organization.add_member(
                            councillor['name'], councillor['role'])
                        # One post per distinct role.
                        if councillor['role'] not in posted_roles:
                            organization.add_post(
                                role=councillor['role'],
                                label=councillor['role'],
                                # TODO: More specific divisions for some committee?
                                division_id=self.jurisdiction.division_id,
                            )
                            posted_roles.append(councillor['role'])

            extras['tmmis_decision_body_ids'].append(
                {instance['term']: instance['decision_body_id']})
            organization.extras = extras
            organization.add_source(instance['source_url'])

            if instance['name'] != instances[canonical_index]['name']:
                # TODO: Add start_date and end_date
                organization.add_name(instance['name'])

        yield organization
def scrape_committees(self, repos):
    """Yield committee and subcommittee Organizations from the
    unitedstates/congress-legislators YAML files listed in *repos*.

    :param repos: iterable of YAML file names within the repository.
    """
    for repo in repos:
        source = ('https://raw.githubusercontent.com/unitedstates/'
                  'congress-legislators/master/{0}'.format(repo))
        committees = self.fetch_yaml(source)
        for committee in committees:
            org = Organization(committee['name'], classification='committee')
            org.add_source(source)

            for key in committee.keys() & {'url', 'rss_url'}:
                org.add_link(committee[key])

            # Phone numbers map to the "voice" contact type; addresses
            # keep their own key as the type.
            for key in committee.keys() & {'phone', 'address'}:
                if key == 'phone':
                    org.add_contact_detail(type='voice', value=committee[key])
                else:
                    org.add_contact_detail(type=key, value=committee[key])

            for key in committee.keys() & {
                'senate_committee_id', 'house_committee_id', 'thomas_id'
            }:
                org.add_identifier(committee[key], scheme=key)

            if 'subcommittees' in committee:
                for subcommittee in committee['subcommittees']:
                    sub_org = Organization(subcommittee['name'],
                                           classification="committee",
                                           parent_id=org._id)
                    sub_org.add_identifier(subcommittee['thomas_id'],
                                           scheme="thomas")
                    sub_org.add_source(source)

                    # BUG FIX: previously read `committee[key]` here, so
                    # subcommittees were given the parent committee's
                    # phone/address (or crashed if the parent lacked the
                    # key). Use the subcommittee's own values.
                    for key in subcommittee.keys() & {'phone', 'address'}:
                        if key == 'phone':
                            sub_org.add_contact_detail(
                                type='voice', value=subcommittee[key])
                        else:
                            sub_org.add_contact_detail(
                                type=key, value=subcommittee[key])

                    yield sub_org

            yield org
def scrape(self, session=None):
    """Yield Wyoming legislative committees for *session* from the LSO
    JSON API, defaulting to the latest session.
    """
    if session is None:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    # com_types = ['J', 'SE', 'O']
    # base_url = 'https://wyoleg.gov/LsoService/api/committeeList/2018/J'
    list_url = 'https://wyoleg.gov/LsoService/api/committees/{}'.format(session)
    listing = json.loads(self.get(list_url).content.decode('utf-8'))

    for row in listing:
        detail_url = 'https://wyoleg.gov/LsoService/api/committeeDetail/{}/{}'.format(
            session, row['ownerID'])
        com = json.loads(self.get(detail_url).content.decode('utf-8'))

        # WY doesn't seem to have any house/senate only committees that I can find
        committee = Organization(name=com['commName'],
                                 chamber='legislature',
                                 classification='committee')

        for member in com['commMembers']:
            if member['chairman'] == 'Chairman':
                role = 'chairman'
            else:
                role = 'member'
            committee.add_member(member['name'], role)

        # some WY committees have non-legislators appointed to the member by the Governor
        # but the formatting is super inconsistent
        if com['otherMembers']:
            committee.extras['other_members'] = com['otherMembers']

        committee.extras['wy_id'] = com['commID']
        committee.extras['wy_code'] = com['ownerID']
        committee.extras['wy_type_code'] = com['type']
        committee.extras['budget'] = com['budget']
        if com['statAuthority']:
            committee.extras['statutory_authority'] = com['statAuthority']
        if com['number']:
            committee.extras['seat_distribution'] = com['number']

        committee.add_identifier(scheme='WY Committee ID',
                                 identifier=str(com['commID']))
        committee.add_identifier(scheme='WY Committee Code',
                                 identifier=str(com['ownerID']))
        if com['description']:
            committee.add_identifier(scheme='Common Name',
                                     identifier=com['description'])

        committee.add_source(
            'http://wyoleg.gov/Committees/{}/{}'.format(session, com['ownerID']))
        yield committee
def scrape(self, session=None):
    """Scrape Wyoming committees for *session* (latest if omitted) and
    yield an Organization per committee.
    """
    if session is None:
        session = self.latest_session()
        self.info("no session specified, using %s", session)

    # com_types = ['J', 'SE', 'O']
    # base_url = 'https://wyoleg.gov/LsoService/api/committeeList/2018/J'
    url = "https://wyoleg.gov/LsoService/api/committees/{}".format(session)
    response = self.get(url)
    coms_json = json.loads(response.content.decode("utf-8"))

    for row in coms_json:
        com_url = "https://wyoleg.gov/LsoService/api/committeeDetail/{}/{}".format(
            session, row["ownerID"])
        com = json.loads(self.get(com_url).content.decode("utf-8"))

        # WY doesn't seem to have any house/senate only committees that I can find
        committee = Organization(
            name=com["commName"],
            chamber="legislature",
            classification="committee",
        )

        for member in com["commMembers"]:
            is_chair = member["chairman"] == "Chairman"
            committee.add_member(member["name"],
                                 "chairman" if is_chair else "member")

        # some WY committees have non-legislators appointed to the member by the Governor
        # but the formatting is super inconsistent
        if com["otherMembers"]:
            committee.extras["other_members"] = com["otherMembers"]

        committee.extras.update({
            "wy_id": com["commID"],
            "wy_code": com["ownerID"],
            "wy_type_code": com["type"],
            "budget": com["budget"],
        })
        if com["statAuthority"]:
            committee.extras["statutory_authority"] = com["statAuthority"]
        if com["number"]:
            committee.extras["seat_distribution"] = com["number"]

        committee.add_identifier(scheme="WY Committee ID",
                                 identifier=str(com["commID"]))
        committee.add_identifier(scheme="WY Committee Code",
                                 identifier=str(com["ownerID"]))
        if com["description"]:
            committee.add_identifier(scheme="Common Name",
                                     identifier=com["description"])

        source_url = "http://wyoleg.gov/Committees/{}/{}".format(
            session, com["ownerID"])
        committee.add_source(source_url)
        yield committee
def scrape(self, session=None):
    """Fetch Wyoming committee data from the LSO API for *session*
    (falling back to the latest session) and yield Organizations.
    """
    if session is None:
        session = self.latest_session()
        self.info('no session specified, using %s', session)

    # com_types = ['J', 'SE', 'O']
    # base_url = 'https://wyoleg.gov/LsoService/api/committeeList/2018/J'
    api_root = 'https://wyoleg.gov/LsoService/api'
    response = self.get('{}/committees/{}'.format(api_root, session))
    for row in json.loads(response.content.decode('utf-8')):
        detail = self.get('{}/committeeDetail/{}/{}'.format(
            api_root, session, row['ownerID']))
        com = json.loads(detail.content.decode('utf-8'))

        # WY doesn't seem to have any house/senate only committees that I can find
        committee = Organization(
            name=com['commName'],
            chamber='legislature',
            classification='committee')

        for member in com['commMembers']:
            role = 'chairman' if member['chairman'] == 'Chairman' else 'member'
            committee.add_member(member['name'], role)

        # some WY committees have non-legislators appointed to the member by the Governor
        # but the formatting is super inconsistent
        if com['otherMembers']:
            committee.extras['other_members'] = com['otherMembers']

        # Always-present extras, copied straight from the API payload.
        for extra_key, api_key in (('wy_id', 'commID'),
                                   ('wy_code', 'ownerID'),
                                   ('wy_type_code', 'type'),
                                   ('budget', 'budget')):
            committee.extras[extra_key] = com[api_key]
        if com['statAuthority']:
            committee.extras['statutory_authority'] = com['statAuthority']
        if com['number']:
            committee.extras['seat_distribution'] = com['number']

        committee.add_identifier(
            scheme='WY Committee ID', identifier=str(com['commID']))
        committee.add_identifier(
            scheme='WY Committee Code', identifier=str(com['ownerID']))
        if com['description']:
            committee.add_identifier(
                scheme='Common Name', identifier=com['description'])

        committee.add_source('http://wyoleg.gov/Committees/{}/{}'.format(
            session, com['ownerID']))
        yield committee
def scrape_committee(self, committee_id):
    """Convert a legacy API committee record into a pupa Organization.

    Pops every known key off the API dict so the trailing assert can
    verify nothing went unhandled. Returns the new Organization.

    NOTE(review): indentation reconstructed from collapsed source —
    confirm loop/branch nesting against upstream history.
    """
    old = self.api('committees/' + committee_id + '?')
    # NOTE: `id` shadows the builtin, and is later rebound by the
    # all_ids loop below before being reused for member roles.
    id = old.pop('id')
    # Discard bookkeeping fields we never migrate.
    old.pop('created_at')
    old.pop('updated_at')
    old.pop('country', None)
    old.pop('level', None)
    old.pop('state')
    old.pop('votesmart_id', None)
    old.pop('+short_name', None)
    old.pop('+session', None)
    old.pop('+az_committee_id', None)

    com = old.pop('committee')
    sub = old.pop('subcommittee')
    parent_id = old.pop('parent_id')

    chamber = old.pop('chamber')
    if chamber == 'joint':
        chamber = ''
    # Unicameral jurisdictions get the generic 'legislature' chamber.
    if self.state in ('ne', 'dc'):
        chamber = 'legislature'

    if sub:
        if parent_id:
            # Attach to the already-migrated parent committee.
            parent = self._committees[parent_id]._id
            new = Organization(sub, parent_id=parent, classification='committee')
        else:
            # Orphan subcommittee: fold the parent name into its own.
            new = Organization(com + ': ' + sub, chamber=chamber,
                               classification='committee')
    else:
        new = Organization(com, chamber=chamber, classification='committee')
        # A top-level committee must not claim a parent.
        assert parent_id is None

    # all_ids
    for id in old.pop('all_ids'):
        new.add_identifier(id, scheme='openstates')
        self._committees[id] = new

    # sources
    for source in old.pop('sources'):
        new.add_source(**source)

    # members
    start, end = self.get_term_years()
    for role in old.pop('members'):
        # leg_id, com_id, role, start, end
        # NOTE(review): `id` here is whatever the all_ids loop left
        # bound — presumably the last openstates id; confirm intended.
        if role['leg_id']:
            self._roles.add((role['leg_id'], id, role['role'], start, end))

    # Anything informational is preserved in extras, '+' prefix stripped.
    to_extras = ['+twitter', '+description', '+code', '+secretary',
                 '+office_hours', '+office_phone', '+meetings_info',
                 '+status', '+aide', '+contact_info', '+comm_type',
                 'comm_type', 'aide', 'contact_info', '+town_represented',
                 '+action_code', ]
    for k in to_extras:
        v = old.pop(k, None)
        if v:
            new.extras[k.replace('+', '')] = v

    # Every key must have been consumed by now.
    assert not old, old.keys()
    return new
def scrape_committee(self, committee_id):
    """Migrate one legacy-API committee record into a pupa Organization.

    Every known key is popped from the response dict; the final assert
    guarantees no field was silently dropped. Returns the Organization.

    NOTE(review): nesting reconstructed from collapsed source — verify
    against upstream history.
    """
    old = self.api('committees/' + committee_id + '?')
    # `id` deliberately shadows the builtin; the all_ids loop below
    # rebinds it, and the members loop reuses the resulting value.
    id = old.pop('id')
    # Fields that have no pupa equivalent are discarded up front.
    old.pop('created_at')
    old.pop('updated_at')
    old.pop('country', None)
    old.pop('level', None)
    old.pop('state')
    old.pop('votesmart_id', None)
    old.pop('+short_name', None)
    old.pop('+session', None)
    old.pop('+az_committee_id', None)

    com = old.pop('committee')
    sub = old.pop('subcommittee')
    parent_id = old.pop('parent_id')

    chamber = old.pop('chamber')
    if chamber == 'joint':
        chamber = ''
    # Nebraska and DC are unicameral.
    if self.state in ('ne', 'dc'):
        chamber = 'legislature'

    if sub:
        if parent_id:
            # Link to the previously migrated parent organization.
            parent = self._committees[parent_id]._id
            new = Organization(sub, parent_id=parent, classification='committee')
        else:
            # No parent on record: qualify the name with the committee's.
            new = Organization(com + ': ' + sub, chamber=chamber,
                               classification='committee')
    else:
        new = Organization(com, chamber=chamber, classification='committee')
        assert parent_id is None

    # all_ids
    for id in old.pop('all_ids'):
        new.add_identifier(id, scheme='openstates')
        self._committees[id] = new

    # sources
    for source in old.pop('sources'):
        new.add_source(**source)

    # members
    start, end = self.get_term_years()
    for role in old.pop('members'):
        # leg_id, com_id, role, start, end
        # NOTE(review): `id` was rebound by the all_ids loop above —
        # presumably intentionally; confirm which id roles should carry.
        if role['leg_id']:
            self._roles.add((role['leg_id'], id, role['role'], start, end))

    # Remaining informational keys land in extras with '+' stripped.
    to_extras = [
        '+twitter', '+description', '+code', '+secretary', '+office_hours',
        '+office_phone', '+meetings_info', '+status', '+aide',
        '+contact_info', '+comm_type', 'comm_type', 'aide', 'contact_info',
        '+town_represented', '+action_code',
    ]
    for k in to_extras:
        v = old.pop(k, None)
        if v:
            new.extras[k.replace('+', '')] = v

    # All keys must be accounted for.
    assert not old, old.keys()
    return new
def scrape(self):
    """Yield historical House/Senate committees and their subcommittees
    from the bundled congress-legislators committees-historical.yaml.

    Raises:
        ValueError: if a committee record has an unrecognized type.
    """
    current_path = Path(__file__)
    committee_path = current_path.parent / 'congress-legislators/committees-historical.yaml'
    with committee_path.open() as f:
        committees = yaml.load(f, Loader=yaml.CLoader)

    for committee in committees:
        if committee['type'] == 'house':
            chamber = 'lower'
        elif committee['type'] == 'senate':
            chamber = 'upper'
        else:
            # BUG FIX: was `print(committee)` followed by a bare `raise`,
            # which itself fails with "No active exception to re-raise".
            raise ValueError(
                'unexpected committee type: {!r}'.format(committee['type']))

        # Names are keyed by congress number; the highest key is current.
        names_int = {
            int(key): name
            for key, name in committee['names'].items()
        }
        _, current_name = max(names_int.items())
        if chamber == 'lower':
            current_name = 'House Committee on ' + current_name
        elif chamber == 'upper':
            current_name = 'Senate Committee on ' + current_name

        c = Organization(current_name, classification='committee',
                         chamber=chamber)
        start, end = duration(committee)
        c.founding_date = start
        if end:
            c.dissolution_date = end

        c.add_identifier(committee['thomas_id'] + '00', scheme='thomas_id')
        if 'house_committee_id' in committee:
            c.add_identifier(committee['house_committee_id'],
                             scheme='house_committee_id')
        if 'senate_committee_id' in committee:
            c.add_identifier(committee['senate_committee_id'],
                             scheme='senate_committee_id')
        c.add_source(
            'https://github.com/unitedstates/congress-legislators/blob/master/committees-historical.yaml'
        )

        # Register every historical name (chamber-prefixed) as an alias.
        for name in set(committee['names'].values()):
            if chamber == 'lower':
                name = 'House Committee on ' + name
            elif chamber == 'upper':
                name = 'Senate Committee on ' + name
            if name != c.name:
                c.add_name(name)

        yield c

        for subcommittee in committee.get('subcommittees', []):
            names_int = {
                int(key): name
                for key, name in subcommittee['names'].items()
            }
            _, current_name = max(names_int.items())
            # CONSISTENCY FIX: pass the parent's ocd id (c._id) rather
            # than the Organization object, matching sibling scrapers.
            sc = Organization('Subcommittee on ' + current_name,
                              classification='committee',
                              parent_id=c._id)
            start, end = duration(subcommittee)
            sc.founding_date = start
            if end:
                sc.dissolution_date = end

            thomas_id = committee['thomas_id'] + subcommittee['thomas_id']
            sc.add_identifier(thomas_id, scheme='thomas_id')
            sc.add_source(
                'https://github.com/unitedstates/congress-legislators/blob/master/committees-historical.yaml'
            )
            # SSJU12 was renumbered SSJU15: alias the new id onto the old
            # record and skip the duplicate so it is emitted exactly once.
            # (Removed leftover debug print/input() for
            # 'Oversight and Investigations'.)
            if thomas_id == 'SSJU12':
                sc.add_identifier('SSJU15', scheme='thomas_id')
            elif thomas_id == 'SSJU15':
                continue

            for name in set(subcommittee['names'].values()):
                name = 'Subcommittee on ' + name
                if name != sc.name:
                    sc.add_name(name)

            yield sc