def test_full_organization():
    """A fully populated scraped organization is stored with all its sub-items."""
    create_jurisdictions()

    scraped = ScrapeOrganization('United Nations', classification='international')
    scraped.add_identifier('un')
    scraped.add_name('UN', start_date='1945')
    scraped.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    scraped.add_link('http://example.com/link')
    scraped.add_source('http://example.com/source')

    # import the serialized organization
    payload = scraped.as_dict()
    OrganizationImporter('jid1').import_data([payload])

    # get the organization from the db and assert it imported correctly
    stored = Organization.objects.get()
    assert 'ocd-organization' in stored.id
    assert stored.name == scraped.name

    identifier = stored.identifiers.all()[0]
    assert identifier.identifier == 'un'
    assert identifier.scheme == ''

    other_name = stored.other_names.all()[0]
    assert other_name.name == 'UN'
    assert other_name.start_date == '1945'

    contact = stored.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert stored.links.all()[0].url == 'http://example.com/link'
    assert stored.sources.all()[0].url == 'http://example.com/source'
def scrape(self):
    """Yield one committee Organization per committee code.

    Committee records gathered from the jurisdiction's legislative sessions
    (iterated in reverse) are looked up by their ``code``. The first record
    for a code is treated as canonical: it supplies the organization's name,
    description and two-letter identifier. Every record contributes a
    ``{term: decision_body_id}`` entry to the extras and a source URL, and a
    record whose name differs from the canonical one is added as an
    alternate name. The ``'CC'`` code is skipped.
    """
    sessions = reversed(self.jurisdiction.legislative_sessions)
    term_instances = committees_from_sessions(self, sessions)
    by_code = build_lookup_dict(
        self, data_list=term_instances, index_key='code')

    # TODO: Ensure this survives addition of new term (2017)
    #       so specific year always creates
    canonical_index = 0

    for code, instances in by_code.items():
        # TODO: Figure out how to edit city council org.
        if code == 'CC':
            continue

        extras = {'tmmis_decision_body_ids': []}
        for index, instance in enumerate(instances):
            if index == canonical_index:
                org = Organization(name=instance['name'], classification='committee')
                extras['description'] = instance['info']
                org.add_identifier(instance['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

            term_entry = {instance['term']: instance['decision_body_id']}
            extras['tmmis_decision_body_ids'].append(term_entry)
            org.extras = extras
            org.add_source(instance['source_url'])

            canonical_name = instances[canonical_index]['name']
            if canonical_name != instance['name']:
                # TODO: Add start_date and end_date
                org.add_name(instance['name'])

        yield org
def test_full_organization():
    """Import a scraped organization with every sub-item type and verify each one."""
    source_org = ScrapeOrganization('United Nations', classification='international')
    source_org.add_identifier('un')
    source_org.add_name('UN', start_date='1945')
    source_org.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    source_org.add_link('http://example.com/link')
    source_org.add_source('http://example.com/source')

    # import org
    importer = OrganizationImporter('jurisdiction-id')
    importer.import_data([source_org.as_dict()])

    # get the organization from the db and assert it imported correctly
    db_org = Organization.objects.get()
    assert 'ocd-organization' in db_org.id
    assert db_org.name == source_org.name

    identifiers = db_org.identifiers.all()
    assert identifiers[0].identifier == 'un'
    assert identifiers[0].scheme == ''

    other_names = db_org.other_names.all()
    assert other_names[0].name == 'UN'
    assert other_names[0].start_date == '1945'

    contact_details = db_org.contact_details.all()
    assert contact_details[0].type == 'phone'
    assert contact_details[0].value == '555-555-1234'
    assert contact_details[0].note == 'this is fake'

    assert db_org.links.all()[0].url == 'http://example.com/link'
    assert db_org.sources.all()[0].url == 'http://example.com/source'
def scrape(self):
    """Scrape committees, yielding a single Organization for each code.

    The first instance listed under a code is the canonical one and defines
    the organization's name, description and identifier. All instances add
    their term/decision-body pair to ``extras['tmmis_decision_body_ids']``
    and their source URL; differing names become alternate names. Entries
    under the ``'CC'`` code are not yielded.
    """
    committee_term_instances = committees_from_sessions(
        self, reversed(self.jurisdiction.legislative_sessions))
    committees_by_code = build_lookup_dict(
        self, data_list=committee_term_instances, index_key='code')

    for code, instances in committees_by_code.items():
        # TODO: Figure out how to edit city council org.
        if code == 'CC':
            continue

        # The list is shared with ``extras`` so appends show up in both.
        decision_body_ids = []
        extras = {'tmmis_decision_body_ids': decision_body_ids}

        for position, record in enumerate(instances):
            # TODO: Ensure this survives addition of new term (2017)
            #       so specific year always creates
            canonical_i = 0
            if position == canonical_i:
                committee = Organization(name=record['name'], classification='committee')
                extras.update(description=record['info'])
                committee.add_identifier(record['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

            decision_body_ids.append({record['term']: record['decision_body_id']})
            committee.extras = extras
            committee.add_source(record['source_url'])

            name_changed = instances[canonical_i]['name'] != record['name']
            if name_changed:
                # TODO: Add start_date and end_date
                committee.add_name(record['name'])

        yield committee
def test_deduplication_other_name_overlaps():
    """After seeding via create_org(), importing 'The United Nations' with the
    alternate name 'United Nations' must leave exactly one Organization row."""
    create_jurisdictions()
    create_org()

    scraped = ScrapeOrganization('The United Nations', classification='international')
    scraped.add_name('United Nations')

    importer = OrganizationImporter('jid1')
    importer.import_data([scraped.as_dict()])

    total = Organization.objects.all().count()
    assert total == 1
def scrape(self):
    """Yield one committee Organization per committee code, with members.

    The first instance under each code is canonical: it provides the name,
    description and two-letter identifier, and (when a reference meeting id
    is available) its councillor membership is scraped, adding each member
    and one post per distinct role. Every instance adds its
    ``{term: decision_body_id}`` entry and source URL, and a differing name
    becomes an alternate name. The ``'CC'`` code and empty codes are skipped.
    """
    sessions = reversed(self.jurisdiction.legislative_sessions)
    term_instances = committees_from_sessions(self, sessions)
    by_code = build_lookup_dict(
        self, data_list=term_instances, index_key='code')

    # TODO: Ensure this survives addition of new term (2017)
    #       so specific year always creates
    canonical_index = 0

    for code, instances in by_code.items():
        # TODO: Figure out how to edit city council org.
        if code == 'CC':
            continue

        # When there are no meetings scheduled and was no way to deduce committee code.
        if not code:
            continue

        extras = {'tmmis_decision_body_ids': []}
        for index, instance in enumerate(instances):
            if index == canonical_index:
                org = Organization(name=instance['name'], classification='committee')
                extras['description'] = instance['info']
                org.add_identifier(instance['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

                # TODO: Scrape non-councillor members
                meeting_id = self.referenceMeetingId(
                    instance['code'], instance['term'])
                if meeting_id:
                    roles_with_post = []
                    membership_url = MEMBERSHIP_URL_TEMPLATE.format(meeting_id)
                    for councillor in self.councillorMembers(membership_url):
                        role = councillor['role']
                        org.add_member(councillor['name'], role)
                        if role in roles_with_post:
                            continue
                        # TODO: More specific divisions for some committee?
                        org.add_post(
                            role=role,
                            label=role,
                            division_id=self.jurisdiction.division_id)
                        roles_with_post.append(role)

            term_entry = {instance['term']: instance['decision_body_id']}
            extras['tmmis_decision_body_ids'].append(term_entry)
            org.extras = extras
            org.add_source(instance['source_url'])

            if instances[canonical_index]['name'] != instance['name']:
                # TODO: Add start_date and end_date
                org.add_name(instance['name'])

        yield org
def scrape(self):
    """Scrape committees and their councillor members, one Organization per code.

    For each code (other than ``'CC'`` or an empty code) the first instance
    creates the organization and, if a reference meeting id is found, loads
    its councillor membership. Each instance then records its decision body
    id for its term, its source URL and, when its name differs from the
    first instance's, an alternate name.
    """

    def add_councillors(committee, meeting_id):
        # Add every councillor as a member and create each role's post once.
        seen_posts = []
        membership_url = MEMBERSHIP_URL_TEMPLATE.format(meeting_id)
        for councillor in self.councillorMembers(membership_url):
            committee.add_member(councillor['name'], councillor['role'])
            if councillor['role'] not in seen_posts:
                committee.add_post(
                    role=councillor['role'],
                    label=councillor['role'],
                    # TODO: More specific divisions for some committee?
                    division_id=self.jurisdiction.division_id,
                )
                seen_posts.append(councillor['role'])

    committee_term_instances = committees_from_sessions(
        self, reversed(self.jurisdiction.legislative_sessions))
    committees_by_code = build_lookup_dict(
        self, data_list=committee_term_instances, index_key='code')

    for code, instances in committees_by_code.items():
        # TODO: Figure out how to edit city council org.
        if code == 'CC':
            continue

        # When there are no meetings scheduled and was no way to deduce committee code.
        if not code:
            continue

        extras = {'tmmis_decision_body_ids': []}
        for position, record in enumerate(instances):
            # TODO: Ensure this survives addition of new term (2017)
            #       so specific year always creates
            canonical_i = 0
            if position == canonical_i:
                committee = Organization(name=record['name'], classification='committee')
                extras.update(description=record['info'])
                committee.add_identifier(record['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

                # TODO: Scrape non-councillor members
                meeting_id = self.referenceMeetingId(record['code'], record['term'])
                if meeting_id:
                    add_councillors(committee, meeting_id)

            extras['tmmis_decision_body_ids'].append(
                {record['term']: record['decision_body_id']})
            committee.extras = extras
            committee.add_source(record['source_url'])

            if instances[canonical_i]['name'] != record['name']:
                # TODO: Add start_date and end_date
                committee.add_name(record['name'])

        yield committee
def test_deduplication_error_overlaps():
    """Importing an org whose name matches one stored org and whose alternate
    name matches another stored org must raise SameOrgNameError."""
    create_jurisdictions()

    shared_fields = {'classification': 'international', 'jurisdiction_id': 'jid1'}
    Organization.objects.create(name='World Wrestling Federation', **shared_fields)
    wildlife_fund = Organization.objects.create(name='World Wildlife Fund', **shared_fields)
    wildlife_fund.other_names.create(name='WWF')

    scraped = ScrapeOrganization('World Wrestling Federation', classification='international')
    scraped.add_name('WWF')
    payload = scraped.as_dict()

    with pytest.raises(SameOrgNameError):
        OrganizationImporter('jid1').import_data([payload])
def test_deduplication_overlap_name_distinct_juris():
    """The same scraped org leaves the count at one when imported with 'jid1'
    (where a matching org exists) and raises it to two when imported with 'jid2'."""
    create_jurisdictions()

    existing = Organization.objects.create(
        name='World Wrestling Federation',
        classification='international',
        jurisdiction_id='jid1',
    )
    existing.other_names.create(name='WWF')

    scraped = ScrapeOrganization(name="WWF", classification="international")
    scraped.add_name('WWF')

    # Import with each jurisdiction id in turn and check the running total.
    for jurisdiction_id, expected_total in (('jid1', 1), ('jid2', 2)):
        importer = OrganizationImporter(jurisdiction_id)
        importer.import_item(scraped.as_dict())
        assert Organization.objects.count() == expected_total
def test_locked_field_subitem():
    """With 'other_names' locked, a reimport that omits the alternate name
    must leave the stored alternate name in place."""
    scraped = ScrapeOrganization('SHIELD')
    scraped.add_name('S.H.I.E.L.D.')
    OrganizationImporter('jid').import_data([scraped.as_dict()])

    # lock the field
    stored = Organization.objects.get()
    stored.locked_fields = ['other_names']
    stored.save()

    # reimport, this time without the alternate name
    bare_payload = ScrapeOrganization('SHIELD').as_dict()
    OrganizationImporter('jid').import_data([bare_payload])

    reloaded = Organization.objects.get()
    assert reloaded.other_names.get().name == 'S.H.I.E.L.D.'
def test_locked_field_subitem():
    """Locking 'other_names' keeps the alternate name when a later import of
    the same organization carries no alternate names."""
    create_jurisdiction()

    first_pass = ScrapeOrganization('SHIELD')
    first_pass.add_name('S.H.I.E.L.D.')
    importer = OrganizationImporter('jid')
    importer.import_data([first_pass.as_dict()])

    # lock the field
    db_org = Organization.objects.get()
    db_org.locked_fields = ['other_names']
    db_org.save()

    # reimport
    second_pass = ScrapeOrganization('SHIELD').as_dict()
    importer = OrganizationImporter('jid')
    importer.import_data([second_pass])

    db_org = Organization.objects.get()
    remaining_name = db_org.other_names.get()
    assert remaining_name.name == 'S.H.I.E.L.D.'
def dict_to_org(self, committee):
    """Build a committee Organization from a committee dict.

    ``committee`` is read for ``'name'``, ``'code'`` and ``'source'``
    (each iterated over) and for either ``'chamber'`` or ``'parent'``.
    The name that sorts last becomes the organization's name; the remaining
    names and every truthy code are added as alternate names. Each source is
    attached and passed to ``self.scrape_members``.

    Returns the populated Organization.
    """
    # sorted() is ascending, so pop() removes the name that sorts last.
    remaining_names = sorted(committee['name'])
    primary_name = remaining_names.pop()

    # Attach to a chamber when one is given, otherwise to the parent org.
    if 'chamber' in committee:
        placement = {'chamber': committee['chamber']}
    else:
        placement = {'parent_id': committee['parent']}
    org = Organization(primary_name, classification='committee', **placement)

    for alias in remaining_names:
        org.add_name(alias)

    for code in filter(None, committee['code']):
        org.add_name(code)

    for source_url in committee['source']:
        org.add_source(source_url)
        self.scrape_members(org, source_url)

    return org
def dict_to_org(self, committee):
    """Convert a committee dict into an Organization.

    The last name in sorted order is used as the organization's name. The
    organization is created with ``chamber`` when the dict has a
    ``"chamber"`` key and with ``parent_id`` taken from ``"parent"``
    otherwise. Other names and non-empty codes become alternate names, and
    members are scraped from each source after it is attached.
    """
    ordered_names = sorted(committee["name"])
    display_name = ordered_names[-1]
    alternate_names = ordered_names[:-1]

    has_chamber = "chamber" in committee
    if not has_chamber:
        organization = Organization(
            display_name, classification="committee", parent_id=committee["parent"]
        )
    else:
        organization = Organization(
            display_name, classification="committee", chamber=committee["chamber"]
        )

    for alternate in alternate_names:
        organization.add_name(alternate)

    non_empty_codes = [code for code in committee["code"] if code]
    for code in non_empty_codes:
        organization.add_name(code)

    for url in committee["source"]:
        organization.add_source(url)
        self.scrape_members(organization, url)

    return organization
def scrape(self):
    """Yield Organizations for committees listed in the historical YAML file.

    Reads ``congress-legislators/committees-historical.yaml`` from the
    directory containing this module. For every committee it yields a
    committee Organization, followed by one Organization per entry in the
    committee's optional ``subcommittees`` list (created with the committee
    as ``parent_id``).

    Each organization is named after the ``names`` entry with the largest
    integer key; the other distinct names are added as alternate names.
    Founding and dissolution dates come from ``duration()``.

    Raises:
        ValueError: if a committee's ``type`` is neither ``'house'`` nor
            ``'senate'``.
    """
    source_url = 'https://github.com/unitedstates/congress-legislators/blob/master/committees-historical.yaml'
    subcommittee_prefix = 'Subcommittee on '

    committee_path = Path(__file__).parent / 'congress-legislators/committees-historical.yaml'

    # NOTE(review): yaml.CLoader is not one of PyYAML's safe loaders. That is
    # acceptable only while this file is a trusted, vendored checkout;
    # consider yaml.CSafeLoader if that ever stops being true.
    with committee_path.open() as f:
        committees = yaml.load(f, Loader=yaml.CLoader)

    for committee in committees:
        if committee['type'] == 'house':
            chamber, prefix = 'lower', 'House Committee on '
        elif committee['type'] == 'senate':
            chamber, prefix = 'upper', 'Senate Committee on '
        else:
            # Fail with a descriptive error instead of a bare ``raise``.
            raise ValueError(
                'unsupported committee type {!r} in committee {!r}'.format(
                    committee['type'], committee))

        # The name stored under the largest integer key is the current one.
        names_int = {int(key): name for key, name in committee['names'].items()}
        _, current_name = max(names_int.items())

        c = Organization(prefix + current_name,
                         classification='committee',
                         chamber=chamber)

        start, end = duration(committee)
        c.founding_date = start
        if end:
            c.dissolution_date = end

        # Full committees carry a '00' suffix on their thomas_id.
        c.add_identifier(committee['thomas_id'] + '00', scheme='thomas_id')
        for scheme in ('house_committee_id', 'senate_committee_id'):
            if scheme in committee:
                c.add_identifier(committee[scheme], scheme=scheme)

        c.add_source(source_url)

        # Sorted so alternate names are added in a reproducible order.
        for name in sorted(set(committee['names'].values())):
            name = prefix + name
            if name != c.name:
                c.add_name(name)

        yield c

        for subcommittee in committee.get('subcommittees', []):
            thomas_id = committee['thomas_id'] + subcommittee['thomas_id']

            # SSJU15 is not emitted on its own; its identifier is attached
            # to SSJU12 below, so skip it before building anything.
            if thomas_id == 'SSJU15':
                continue

            names_int = {int(key): name for key, name in subcommittee['names'].items()}
            _, current_name = max(names_int.items())

            sc = Organization(subcommittee_prefix + current_name,
                              classification='committee',
                              parent_id=c)

            start, end = duration(subcommittee)
            sc.founding_date = start
            if end:
                sc.dissolution_date = end

            sc.add_identifier(thomas_id, scheme='thomas_id')
            sc.add_source(source_url)

            if thomas_id == 'SSJU12':
                sc.add_identifier('SSJU15', scheme='thomas_id')

            for name in sorted(set(subcommittee['names'].values())):
                name = subcommittee_prefix + name
                if name != sc.name:
                    sc.add_name(name)

            yield sc