def test_full_person():
    """Import a person carrying every kind of related record and verify the stored copy.

    The scraped person is given an identifier, an alternate name, a contact
    detail, a link and a source.  After the import, the stored Person must
    expose each of those values unchanged.
    """
    scraped = ScrapePerson('Tom Sawyer')
    scraped.add_identifier('1')
    scraped.add_name('Tommy', start_date='1880')
    scraped.add_contact_detail(type='phone', value='555-555-1234',
                               note='this is fake')
    scraped.add_link('http://example.com/link')
    scraped.add_source('http://example.com/source')

    # Feed the serialized person through the importer.
    PersonImporter('jurisdiction-id').import_data([scraped.as_dict()])

    # Read the person back from the database and compare field by field.
    stored = Person.objects.get()
    assert 'ocd-person' in stored.id
    assert stored.name == scraped.name

    # No scheme was supplied for the identifier, so it is stored as ''.
    identifier = stored.identifiers.all()[0]
    assert identifier.identifier == '1'
    assert identifier.scheme == ''

    other_name = stored.other_names.all()[0]
    assert other_name.name == 'Tommy'
    assert other_name.start_date == '1880'

    contact = stored.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert stored.links.all()[0].url == 'http://example.com/link'
    assert stored.sources.all()[0].url == 'http://example.com/source'
def test_deduplication_other_name_overlaps():
    """A scraped person whose other name overlaps an existing name is not duplicated.

    ``create_person()`` seeds the database first (its exact contents are
    defined elsewhere in the file).  The scraped person is called
    'The Rock' but also carries the other name 'Dwayne Johnson'; after the
    import there must still be exactly one Person row.
    """
    create_person()

    # The primary name differs; only the alternate name overlaps.
    scraped = ScrapePerson('The Rock')
    scraped.add_name('Dwayne Johnson')

    importer = PersonImporter('jurisdiction-id')
    importer.import_data([scraped.as_dict()])

    assert Person.objects.all().count() == 1
def test_same_name_people_other_name():
    """Name-collision detection must take ``other_names`` into account.

    Two scraped people are imported in one batch: one is named
    'Dwayne Johnson', the other is named 'Rock' but lists 'Dwayne Johnson'
    as an other name.  They carry different images, so they appear to be
    different people sharing a name, and the importer is expected to raise
    ``SameNameError``.

    NOTE(review): another function with this exact name appears later in
    the file; if both live in the same module the later definition shadows
    this one and pytest never collects it.
    """
    # The organization row is only needed for its side effect in the
    # database; the returned object is not used by the test.
    Organization.objects.create(name='WWE', jurisdiction_id='jurisdiction-id')

    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Rock', image='http://example.com/2')
    p2.add_name('Dwayne Johnson')

    # The people have the same name but are apparently different.
    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data([p1.as_dict(),
                                                       p2.as_dict()])
def test_same_name_people_other_name():
    """Importing two apparently different people who share a name must fail.

    One scraped person is named 'Dwayne Johnson'; the other is named 'Rock'
    and has 'Dwayne Johnson' as an other name.  Their images differ, so the
    importer is expected to raise ``SameNameError`` instead of merging them.

    NOTE(review): an earlier function in this file has the same name; if
    both live in the same module, this later definition is the one pytest
    collects.
    """
    create_jurisdiction()
    Organization.objects.create(name='WWE', jurisdiction_id='jid')

    named_directly = ScrapePerson('Dwayne Johnson',
                                  image='http://example.com/1')
    named_via_alias = ScrapePerson('Rock', image='http://example.com/2')
    # The clash only exists through this alternate name.
    named_via_alias.add_name('Dwayne Johnson')

    with pytest.raises(SameNameError):
        PersonImporter('jid').import_data(
            [named_directly.as_dict(), named_via_alias.as_dict()]
        )
def scrape(self):
    """Yield council members and the committees they sit on.

    Entries from ``self.councilMembers(extra_args=...)`` are grouped by the
    member's profile URL; entries without a URL are skipped.  For each
    member a ``Person`` is built from the last entry of the group:

    * terms: the (start, end, district) spans of all entries are sorted
      and merged when a span starts exactly one day after the previous
      one ends and names the same district; one term is added per merged
      span,
    * party ('Democrat' is rewritten to 'Democratic'), photo, e-mail,
      web site, notes and source URL,
    * committee memberships, for committees whose name contains
      'committee' (case-insensitive) and is not 'Committee of the Whole'.
      The committee list used is the one that came with the first entry
      seen for that URL.

    Yields:
        Every ``Person``, then the collected organizations whose name
        contains 'Committee', then those whose name contains
        'Subcommittee', and finally two hard-coded organizations.
    """
    noncommittees = {'Committee of the Whole'}
    committee_d = {}
    people_d = {}

    # Go to memberlist.  Form field handed to councilMembers(); presumably
    # selects the 'City Council' listing -- confirm against the scraper base.
    extra_args = {'ctl00$ContentPlaceHolder$lstName': 'City Council'}

    # Group every entry under the member's profile URL.
    for councilman, committees in self.councilMembers(extra_args=extra_args):
        if 'url' not in councilman['Person Name']:
            continue
        councilman_url = councilman['Person Name']['url']
        if councilman_url in people_d:
            people_d[councilman_url][0].append(councilman)
        else:
            people_d[councilman_url] = [councilman], committees

    for person_entries, committees in people_d.values():
        # The last entry supplies all descriptive fields of the person.
        councilman = person_entries[-1]

        p = Person(councilman['Person Name']['label'])
        if p.name == 'Letitia James':
            p.name = 'Letitia Ms. James'
            p.add_name('Letitia James')

        spans = [(self.toTime(entry['Start Date']).date(),
                  self.toTime(entry['End Date']).date(),
                  entry['District'])
                 for entry in person_entries]

        # Merge spans that are back-to-back (one day apart) in the same
        # district.  The last element of merged_spans is the span currently
        # being extended.
        merged_spans = []
        one_day = datetime.timedelta(1)
        for start_date, end_date, district in sorted(spans):
            current = merged_spans[-1] if merged_spans else None
            if (current is not None
                    and start_date - current[1] == one_day
                    and district == current[2]):
                current[1] = end_date
            else:
                merged_spans.append([start_date, end_date, district])

        # NOTE(review): every term is recorded against the district of the
        # last listed entry; the district stored on each merged span is not
        # used.  This preserves the existing behaviour -- confirm whether
        # per-span districts were intended.
        district = councilman['District'].replace(' 0', ' ')
        for start_date, end_date, _span_district in merged_spans:
            # An end date of 2017-12-31 is emitted as blank; presumably it
            # marks the then-current term as open-ended -- TODO confirm.
            if end_date == datetime.date(2017, 12, 31):
                term_end = ''
            else:
                term_end = end_date.isoformat()
            p.add_term('Council Member', 'legislature',
                       district=district,
                       start_date=start_date.isoformat(),
                       end_date=term_end)

        party = councilman['Political Party']
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)

        if councilman['Photo']:
            p.image = councilman['Photo']

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['url'],
                                 note='E-mail')

        if councilman['Web site']:
            p.add_link(councilman['Web site']['url'], note='web site')

        p.extras = {'Notes': councilman['Notes']}
        p.add_source(councilman['Person Name']['url'], note='web')

        for committee, _, _ in committees:
            committee_name = committee['Department Name']['label']
            if (committee_name in noncommittees
                    or 'committee' not in committee_name.lower()):
                continue

            # Create each committee once and reuse it for later members.
            o = committee_d.get(committee_name)
            if o is None:
                parent_id = PARENT_ORGS.get(committee_name,
                                            'New York City Council')
                o = Organization(committee_name,
                                 classification='committee',
                                 parent_id={'name': parent_id})
                o.add_source(committee['Department Name']['url'])
                committee_d[committee_name] = o

            membership = o.add_member(p, role=committee["Title"])
            membership.start_date = self.mdY2Ymd(committee["Start Date"])

        yield p

    # Two passes: names containing 'Committee' first, names containing
    # 'Subcommittee' second (presumably so parents precede their children).
    for o in committee_d.values():
        if 'Committee' in o.name:
            yield o
    for o in committee_d.values():
        if 'Subcommittee' in o.name:
            yield o

    # Hard-coded organizations that are always emitted.
    o = Organization(
        'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
        classification='committee',
        parent_id={'name': 'New York City Council'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o

    o = Organization(
        'Subcommittee on Drug Abuse',
        classification='committee',
        parent_id={
            'name': 'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'
        })
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o
def scrape(self):
    """Yield council members and the committees they sit on.

    Entries from ``self.councilMembers()`` are grouped by the member's
    profile URL; entries without a URL are skipped.  For each member a
    ``Person`` is built from the last entry of the group:

    * terms: the (start, end, district) spans of all entries are sorted
      and merged when a span starts exactly one day after the previous
      one ends and names the same district; one term is added per merged
      span,
    * party ('Democrat' is rewritten to 'Democratic'), photo, e-mail,
      web site, notes and source URL,
    * committee memberships, for committees whose name contains
      'committee' (case-insensitive) and is not 'Committee of the Whole'.
      The committee list used is the one that came with the first entry
      seen for that URL.

    Yields:
        Every ``Person``, then the collected organizations whose name
        contains 'Committee', then those whose name contains
        'Subcommittee', and finally two hard-coded organizations.
    """
    noncommittees = {'Committee of the Whole'}
    committee_d = {}
    people_d = {}

    # Group every entry under the member's profile URL.
    for councilman, committees in self.councilMembers():
        if 'url' not in councilman['Person Name']:
            continue
        councilman_url = councilman['Person Name']['url']
        if councilman_url in people_d:
            people_d[councilman_url][0].append(councilman)
        else:
            people_d[councilman_url] = [councilman], committees

    for person_entries, committees in people_d.values():
        # The last entry supplies all descriptive fields of the person.
        councilman = person_entries[-1]

        p = Person(councilman['Person Name']['label'])
        if p.name == 'Letitia James':
            p.name = 'Letitia Ms. James'
            p.add_name('Letitia James')

        spans = [(self.toTime(entry['Start Date']).date(),
                  self.toTime(entry['End Date']).date(),
                  entry['District'])
                 for entry in person_entries]

        # Merge spans that are back-to-back (one day apart) in the same
        # district.  The last element of merged_spans is the span currently
        # being extended.
        merged_spans = []
        one_day = datetime.timedelta(1)
        for start_date, end_date, district in sorted(spans):
            current = merged_spans[-1] if merged_spans else None
            if (current is not None
                    and start_date - current[1] == one_day
                    and district == current[2]):
                current[1] = end_date
            else:
                merged_spans.append([start_date, end_date, district])

        # NOTE(review): every term is recorded against the district of the
        # last listed entry; the district stored on each merged span is not
        # used.  This preserves the existing behaviour -- confirm whether
        # per-span districts were intended.
        district = councilman['District'].replace(' 0', ' ')
        for start_date, end_date, _span_district in merged_spans:
            # An end date of 2017-12-31 is emitted as blank; presumably it
            # marks the then-current term as open-ended -- TODO confirm.
            if end_date == datetime.date(2017, 12, 31):
                term_end = ''
            else:
                term_end = end_date.isoformat()
            p.add_term('Council Member', 'legislature',
                       district=district,
                       start_date=start_date.isoformat(),
                       end_date=term_end)

        party = councilman['Political Party']
        if party == 'Democrat':
            party = 'Democratic'
        if party:
            p.add_party(party)

        if councilman['Photo']:
            p.image = councilman['Photo']

        if councilman["E-mail"]:
            p.add_contact_detail(type="email",
                                 value=councilman['E-mail']['url'],
                                 note='E-mail')

        if councilman['Web site']:
            p.add_link(councilman['Web site']['url'], note='web site')

        p.extras = {'Notes': councilman['Notes']}
        p.add_source(councilman['Person Name']['url'], note='web')

        for committee, _, _ in committees:
            committee_name = committee['Department Name']['label']
            if (committee_name in noncommittees
                    or 'committee' not in committee_name.lower()):
                continue

            # Create each committee once and reuse it for later members.
            o = committee_d.get(committee_name)
            if o is None:
                parent_id = PARENT_ORGS.get(committee_name,
                                            'New York City Council')
                o = Organization(committee_name,
                                 classification='committee',
                                 parent_id={'name': parent_id})
                o.add_source(committee['Department Name']['url'])
                committee_d[committee_name] = o

            membership = o.add_member(p, role=committee["Title"])
            membership.start_date = self.mdY2Ymd(committee["Start Date"])

        yield p

    # Two passes: names containing 'Committee' first, names containing
    # 'Subcommittee' second (presumably so parents precede their children).
    for o in committee_d.values():
        if 'Committee' in o.name:
            yield o
    for o in committee_d.values():
        if 'Subcommittee' in o.name:
            yield o

    # Hard-coded organizations that are always emitted.
    o = Organization('Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services',
                     classification='committee',
                     parent_id={'name': 'New York City Council'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o

    o = Organization('Subcommittee on Drug Abuse',
                     classification='committee',
                     parent_id={'name': 'Committee on Mental Health, Developmental Disability, Alcoholism, Drug Abuse and Disability Services'})
    o.add_source("http://legistar.council.nyc.gov/Departments.aspx")
    yield o