def do_import(stream, stransaction):
    """Import the scraped objects in ``stream`` inside one database transaction.

    Args:
        stream: iterable of scraped objects. It is materialized into a list
            because it is scanned once per object type.
        stransaction: object whose ``jurisdiction.id`` is handed to every
            importer.

    Returns:
        dict: the reports returned by each ``import_data`` call, merged in
        import order (organizations, people, posts, memberships).
    """
    objects = list(stream)
    jid = stransaction.jurisdiction.id

    orgs = OrganizationImporter(jid)
    people = PersonImporter(jid)
    posts = PostImporter(jid, orgs)
    memberships = MembershipImporter(jid, people, orgs, posts)

    def dicts_of(otype):
        # Lazily produce the dict form of every object of the requested type.
        return (obj.as_dict() for obj in objects if isinstance(obj, otype))

    report = {}
    with transaction.atomic():
        report.update(orgs.import_data(dicts_of(Organization)))
        report.update(people.import_data(dicts_of(Person)))
        report.update(posts.import_data(dicts_of(Post)))
        report.update(memberships.import_data(dicts_of(Membership)))
    return report
def test_multiple_orgs_of_same_class():
    """
    Memberships must land on the right organization even when several
    organizations in the same jurisdiction share a classification.
    """
    for org_id, org_name in (("fnd", "Foundation"), ("fdr", "Federation")):
        Organization.objects.create(id=org_id, name=org_name,
                                    classification="foundation",
                                    jurisdiction_id="fnd-jid")

    # (person name, name of the organization the membership should point at)
    expected = (('Hari Seldon', 'Foundation'),
                ('Jean Luc Picard', 'Federation'))
    scraped = [
        ScrapePerson(person_name, primary_org='foundation', role='founder',
                     primary_org_name=org_name)
        for person_name, org_name in expected
    ]

    person_imp = PersonImporter('fnd-jid')
    for scraped_person in scraped:
        person_imp.import_data([scraped_person.as_dict()])

    # try to import a membership: the first related object of each person
    membership_imp = MembershipImporter('fnd-jid', person_imp,
                                        OrganizationImporter('fnd-jid'),
                                        DumbMockImporter())
    membership_imp.import_data(
        [scraped_person._related[0].as_dict() for scraped_person in scraped])

    for person_name, org_name in expected:
        membership = Person.objects.get(name=person_name).memberships.get()
        assert membership.organization.name == org_name
def do_import(stream, transaction):
    """Import the scraped objects in ``stream`` for one jurisdiction.

    Args:
        stream: iterable of scraped objects. It is materialized into a list
            because it is scanned once per object type.
        transaction: object whose ``jurisdiction.id`` is handed to every
            importer.

    Returns:
        dict: the reports returned by each ``import_data`` call, merged in
        import order (jurisdictions, organizations, people, posts,
        memberships).
    """
    objects = list(stream)
    jid = transaction.jurisdiction.id

    jurisdictions = JurisdictionImporter(jid)
    orgs = OrganizationImporter(jid)
    people = PersonImporter(jid)
    posts = PostImporter(jid, orgs)
    memberships = MembershipImporter(jid, people, orgs, posts)

    # This basically relates to Pupa's pupa.clu.commands.update:113
    # TODO (carried over from there): wrap this in a transaction.

    def dicts_of(otype):
        # Lazily produce the dict form of every object of the requested type.
        return (obj.as_dict() for obj in objects if isinstance(obj, otype))

    report = {}
    report.update(jurisdictions.import_data(dicts_of(Jurisdiction)))
    report.update(orgs.import_data(dicts_of(Organization)))
    report.update(people.import_data(dicts_of(Person)))
    report.update(posts.import_data(dicts_of(Post)))
    report.update(memberships.import_data(dicts_of(Membership)))
    return report
def test_bill_sponsor_by_identifier():
    """A sponsorship added by identifier resolves to the matching person."""
    create_jurisdiction()
    org = create_org()

    scraped_bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                              classification='tax bill', chamber='lower')
    scraped_bill.add_sponsorship_by_identifier(
        name="SNODGRASS",
        classification='sponsor',
        entity_type='person',
        primary=True,
        identifier="TOTALLY_REAL_ID",
        scheme="TOTALLY_REAL_SCHEME",
    )

    org_importer = OrganizationImporter('jid')
    person_importer = PersonImporter('jid')

    scraped_person = ScrapePerson(name='Zadock Snodgrass')
    scraped_person.add_identifier(identifier='TOTALLY_REAL_ID',
                                  scheme='TOTALLY_REAL_SCHEME')
    person_importer.import_data([scraped_person.as_dict()])

    # Attach the freshly imported person to the organization.
    db_person = Person.objects.get()
    Membership.objects.create(person_id=db_person.id, organization_id=org.id)

    bill_importer = BillImporter('jid', org_importer, person_importer)
    bill_importer.import_data([scraped_bill.as_dict()])

    # Unpacking fails unless the bill has exactly one sponsorship.
    [sponsorship] = Bill.objects.get().sponsorships.all()
    assert sponsorship.person.name == "Zadock Snodgrass"
def do_import(self, juris, args):
    """Import the scraped data directory of ``args.module`` for ``juris``.

    Args:
        juris: object providing ``jurisdiction_id`` for the importers.
        args: object whose ``module`` attribute names the sub-directory of
            ``settings.SCRAPED_DATA_DIR`` to import.

    Returns:
        dict: the reports returned by each ``import_directory`` call, merged
        in import order.
    """
    datadir = os.path.join(settings.SCRAPED_DATA_DIR, args.module)

    juris_importer = JurisdictionImporter(juris.jurisdiction_id)
    org_importer = OrganizationImporter(juris.jurisdiction_id)
    person_importer = PersonImporter(juris.jurisdiction_id)
    post_importer = PostImporter(juris.jurisdiction_id, org_importer)
    membership_importer = MembershipImporter(juris.jurisdiction_id,
                                             person_importer,
                                             org_importer,
                                             post_importer)
    # TODO: bill, vote and event importers are not wired in yet.

    # Order matters: later importers are constructed with earlier ones.
    pipeline = (
        juris_importer,
        org_importer,
        person_importer,
        post_importer,
        membership_importer,
    )

    report = {}
    # TODO: wrap in a transaction
    for importer in pipeline:
        report.update(importer.import_directory(datadir))
    return report
def do_import(self, juris, args):
    """Import scraped data for ``juris`` and refresh session quality reports.

    The importers run inside a single ``transaction.atomic()`` block. Which
    groups run is controlled by the ``settings.ENABLE_*`` flags. Afterwards a
    ``SessionDataQualityReport`` is regenerated for every session reported by
    the bill and vote event importers.

    Args:
        juris: object providing ``jurisdiction_id`` for the importers.
        args: object whose ``module`` attribute names the sub-directory of
            ``settings.SCRAPED_DATA_DIR`` to import.

    Returns:
        dict: the reports returned by each ``import_directory`` call that ran.
    """
    # Imported here to avoid loading Django code unnecessarily.
    from pupa.importers import (JurisdictionImporter, OrganizationImporter,
                                PersonImporter, PostImporter,
                                MembershipImporter, BillImporter,
                                VoteEventImporter, EventImporter)
    from pupa.reports import generate_session_report
    from pupa.models import SessionDataQualityReport

    datadir = os.path.join(settings.SCRAPED_DATA_DIR, args.module)

    juris_importer = JurisdictionImporter(juris.jurisdiction_id)
    org_importer = OrganizationImporter(juris.jurisdiction_id)
    person_importer = PersonImporter(juris.jurisdiction_id)
    post_importer = PostImporter(juris.jurisdiction_id, org_importer)
    membership_importer = MembershipImporter(juris.jurisdiction_id,
                                             person_importer,
                                             org_importer,
                                             post_importer)
    bill_importer = BillImporter(juris.jurisdiction_id, org_importer,
                                 person_importer)
    vote_event_importer = VoteEventImporter(juris.jurisdiction_id,
                                            person_importer,
                                            org_importer,
                                            bill_importer)
    event_importer = EventImporter(juris.jurisdiction_id, org_importer,
                                   person_importer, bill_importer,
                                   vote_event_importer)

    report = {}

    def run(label, importer):
        # Announce the stage, then merge that importer's report.
        print(label)
        report.update(importer.import_directory(datadir))

    with transaction.atomic():
        run('import jurisdictions...', juris_importer)
        if settings.ENABLE_PEOPLE_AND_ORGS:
            run('import organizations...', org_importer)
            run('import people...', person_importer)
            run('import posts...', post_importer)
            run('import memberships...', membership_importer)
        if settings.ENABLE_BILLS:
            run('import bills...', bill_importer)
        if settings.ENABLE_EVENTS:
            run('import events...', event_importer)
        if settings.ENABLE_VOTES:
            run('import vote events...', vote_event_importer)

    # compile info on all sessions that were updated in this run
    seen_sessions = set()
    for session_source in (bill_importer, vote_event_importer):
        seen_sessions.update(session_source.get_seen_sessions())

    for session in seen_sessions:
        new_report = generate_session_report(session)
        # Replace the old report atomically so a session never has none.
        with transaction.atomic():
            SessionDataQualityReport.objects.filter(
                legislative_session=session).delete()
            new_report.save()

    return report
def test_resolve_json_id():
    """Pseudo ids resolve by primary name and by other name alike."""
    wwe = Organization.objects.create(name='WWE',
                                      jurisdiction_id='jurisdiction-id')
    person = Person.objects.create(name='Dwayne Johnson')
    person.other_names.create(name='Rock')
    person.memberships.create(organization=wwe)

    importer = PersonImporter('jurisdiction-id')
    pseudo_ids = ('~{"name": "Dwayne Johnson"}', '~{"name": "Rock"}')
    for pseudo_id in pseudo_ids:
        assert importer.resolve_json_id(pseudo_id) == person.id
def do_import(self, juris, args):
    """Import every scraped object type for ``juris`` in one transaction.

    Args:
        juris: object providing ``jurisdiction_id`` for the importers.
        args: object whose ``module`` attribute names the sub-directory of
            ``settings.SCRAPED_DATA_DIR`` to import.

    Returns:
        dict: the reports returned by each ``import_directory`` call, merged
        in import order.
    """
    # Imported here to avoid loading Django code unnecessarily.
    from pupa.importers import (JurisdictionImporter, OrganizationImporter,
                                PersonImporter, PostImporter,
                                MembershipImporter, BillImporter,
                                VoteImporter, EventImporter,
                                DisclosureImporter)

    datadir = os.path.join(settings.SCRAPED_DATA_DIR, args.module)

    juris_importer = JurisdictionImporter(juris.jurisdiction_id)
    org_importer = OrganizationImporter(juris.jurisdiction_id)
    person_importer = PersonImporter(juris.jurisdiction_id)
    post_importer = PostImporter(juris.jurisdiction_id, org_importer)
    membership_importer = MembershipImporter(juris.jurisdiction_id,
                                             person_importer,
                                             org_importer,
                                             post_importer)
    bill_importer = BillImporter(juris.jurisdiction_id, org_importer,
                                 person_importer)
    vote_importer = VoteImporter(juris.jurisdiction_id, person_importer,
                                 org_importer, bill_importer)
    event_importer = EventImporter(juris.jurisdiction_id, org_importer,
                                   person_importer)
    disclosure_importer = DisclosureImporter(juris.jurisdiction_id,
                                             org_importer,
                                             person_importer,
                                             event_importer)

    # (progress message, importer), in the order they are run
    stages = (
        ('import jurisdictions...', juris_importer),
        ('import organizations...', org_importer),
        ('import people...', person_importer),
        ('import posts...', post_importer),
        ('import memberships...', membership_importer),
        ('import bills...', bill_importer),
        ('import events...', event_importer),
        ('import disclosures...', disclosure_importer),
        ('import votes...', vote_importer),
    )

    report = {}
    with transaction.atomic():
        for message, importer in stages:
            print(message)
            report.update(importer.import_directory(datadir))
    return report
def test_no_membership_for_person():
    """Membership import raises when an imported person has no memberships.

    A person is imported without any membership, then the membership
    importer is run with an empty list; it is expected to raise
    ``NoMembershipsError``.
    """
    # Only the database row is needed, so the created object is not kept.
    Organization.objects.create(id="fnd", name="Foundation",
                                classification="foundation",
                                jurisdiction_id="fnd-jid")

    # import a person with no memberships
    p = ScrapePerson('a man without a country')
    person_imp = PersonImporter('fnd-jid')
    person_imp.import_data([p.as_dict()])

    # try to import a membership
    dumb_imp = DumbMockImporter()
    memimp = MembershipImporter('fnd-jid', person_imp, dumb_imp, dumb_imp)

    with pytest.raises(NoMembershipsError):
        memimp.import_data([])
def test_full_vote_event():
    """Import a complete vote event and check every stored piece of it."""
    jurisdiction = Jurisdiction.objects.create(id='jid', division_id='did')
    jurisdiction.legislative_sessions.create(name='1900', identifier='1900')

    john = ScrapePerson('John Smith', primary_org='lower')
    adam = ScrapePerson('Adam Smith', primary_org='lower')
    house = ScrapeOrganization(name='House', classification='lower')
    scraped_bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                              from_organization=house._id)
    scraped_vote = ScrapeVoteEvent(
        legislative_session='1900',
        motion_text='passage',
        start_date='1900-04-01',
        classification='passage:bill',
        result='pass',
        bill_chamber='lower',
        bill='HB 1',
        organization=house._id,
    )
    scraped_vote.set_count('yes', 20)
    scraped_vote.yes('John Smith')
    scraped_vote.no('Adam Smith')

    # Import dependencies first: organization, people, memberships, bill.
    org_importer = OrganizationImporter('jid')
    org_importer.import_data([house.as_dict()])

    person_importer = PersonImporter('jid')
    person_importer.import_data([john.as_dict(), adam.as_dict()])

    membership_importer = MembershipImporter('jid', person_importer,
                                             org_importer, DumbMockImporter())
    membership_importer.import_data([john._related[0].as_dict(),
                                     adam._related[0].as_dict()])

    bill_importer = BillImporter('jid', org_importer, person_importer)
    bill_importer.import_data([scraped_bill.as_dict()])

    vote_importer = VoteEventImporter('jid', person_importer, org_importer,
                                      bill_importer)
    vote_importer.import_data([scraped_vote.as_dict()])

    assert VoteEvent.objects.count() == 1
    ve = VoteEvent.objects.get()
    assert ve.legislative_session == LegislativeSession.objects.get()
    assert ve.motion_classification == ['passage:bill']
    assert ve.bill == Bill.objects.get()

    count = ve.counts.get()
    assert count.option == 'yes'
    assert count.value == 20

    votes = list(ve.votes.all())
    assert len(votes) == 2
    for vote in ve.votes.all():
        # Anyone who is not John Smith is expected to be Adam Smith's "no".
        if vote.voter_name == 'John Smith':
            expected_option, expected_voter = 'yes', 'John Smith'
        else:
            expected_option, expected_voter = 'no', 'Adam Smith'
        assert vote.option == expected_option
        assert vote.voter == Person.objects.get(name=expected_voter)
def test_no_membership_for_person_including_party():
    """
    even though party is specified we should still get a no memberships error
    because it doesn't bind the person to a jurisdiction, thus causing
    duplication
    """
    # Only the database rows are needed, so the created objects are not kept.
    Organization.objects.create(id="fnd", name="Foundation",
                                classification="foundation",
                                jurisdiction_id="fnd-jid")
    Organization.objects.create(id="dem", name="Democratic",
                                classification="party")

    # import a person with no memberships
    p = ScrapePerson('a man without a country', party='Democratic')
    person_imp = PersonImporter('fnd-jid')
    person_imp.import_data([p.as_dict()])

    # try to import a membership
    dumb_imp = DumbMockImporter()
    memimp = MembershipImporter('fnd-jid', person_imp, dumb_imp, dumb_imp)

    with pytest.raises(NoMembershipsError):
        memimp.import_data([p._related[0].as_dict()])
def test_bill_sponsor_limit_lookup():
    """A sponsor lookup by identifier must not cross jurisdiction bounds.

    Two people share a name and an identifier but differ in birth date. The
    first is imported for ``jid`` and given a membership there; the second is
    imported through an importer for ``another-jurisdiction``. The bill's
    sponsorship must resolve to the first person.
    """
    create_jurisdiction()
    org = create_org()

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS",
                                       classification='sponsor',
                                       entity_type='person',
                                       primary=True,
                                       identifier="TOTALLY_REAL_ID",
                                       scheme="TOTALLY_REAL_SCHEME")

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    zs = ScrapePerson(name='Zadock Snodgrass', birth_date="1800-01-01")
    zs.add_identifier(identifier='TOTALLY_REAL_ID',
                      scheme='TOTALLY_REAL_SCHEME')
    pi.import_data([zs.as_dict()])

    za_db = Person.objects.get()
    Membership.objects.create(person_id=za_db.id, organization_id=org.id)

    zs2 = ScrapePerson(name='Zadock Snodgrass', birth_date="1900-01-01")
    zs2.add_identifier(identifier='TOTALLY_REAL_ID',
                       scheme='TOTALLY_REAL_SCHEME')

    # This is contrived and perhaps broken, but we're going to check this.
    # We *really* don't want to *ever* cross jurisdiction bounds.
    # Import the look-alike (zs2) rather than zs again, so the birth_date
    # assertion below can actually tell the two people apart.
    PersonImporter('another-jurisdiction').import_data([zs2.as_dict()])

    BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    obj = Bill.objects.get()
    (entry,) = obj.sponsorships.all()
    assert entry.person.name == "Zadock Snodgrass"
    assert entry.person.birth_date == "1800-01-01"
def test_same_name_second_import():
    """A later same-name import without a birth date must be rejected.

    Two people with the same name but different birth dates import cleanly.
    A third person with that name and no birth date is then expected to
    raise ``SameNameError``.
    """
    # ensure two people with the same name don't import without birthdays
    o = Organization.objects.create(name='WWE',
                                    jurisdiction_id='jurisdiction-id')
    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Dwayne Johnson', image='http://example.com/2')
    p1.birth_date = '1970'
    p2.birth_date = '1930'

    # when we give them birth dates all is well though
    # (the import report is not inspected here, so it is not bound)
    PersonImporter('jurisdiction-id').import_data(
        [p1.as_dict(), p2.as_dict()])

    # fake some memberships so future lookups work on these people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)

    p3 = ScrapePerson('Dwayne Johnson', image='http://example.com/3')

    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data([p3.as_dict()])
def test_same_name_people_other_name():
    """Name collisions are detected through ``other_names`` as well.

    The second person's primary name differs, but one of their other names
    equals the first person's name, so importing both together is expected
    to raise ``SameNameError``.
    """
    # ensure we're taking other_names into account for the name collision code
    # (only the database row is needed, so the created object is not kept)
    Organization.objects.create(name='WWE', jurisdiction_id='jurisdiction-id')

    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Rock', image='http://example.com/2')
    p2.add_name('Dwayne Johnson')

    # the people have the same name but are apparently different
    with pytest.raises(SameNameError):
        PersonImporter('jurisdiction-id').import_data(
            [p1.as_dict(), p2.as_dict()])
def test_bill_chamber_param():
    """A bill scraped with ``chamber`` is linked to the created organization."""
    create_jurisdiction()
    expected_org = create_org()

    scraped_bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                              classification='tax bill', chamber='lower')

    org_importer = OrganizationImporter('jid')
    person_importer = PersonImporter('jid')
    bill_importer = BillImporter('jid', org_importer, person_importer)
    bill_importer.import_data([scraped_bill.as_dict()])

    stored_bill = Bill.objects.get()
    assert stored_bill.from_organization_id == expected_org.id
def test_bill_update_subsubitem():
    """Changes to a version's links are reported as insert/update/noop.

    The same bill is imported five times with different sets of links on its
    single 'printing' version. After each import the reported outcome and the
    stored version/link counts are checked.
    """
    create_jurisdiction()
    create_org()
    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    # (url, media type) pairs used as version links below
    pdf = ('http://example.com/test.pdf', 'application/pdf')
    text = ('http://example.com/test.text', 'text/plain')
    other_text = ('http://example.com/diff-link.txt', 'text/plain')

    def import_with_links(links):
        # Build a fresh scrape of HB 1 carrying the given links and import it.
        bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')
        for url, media_type in links:
            bill.add_version_link('printing', url, media_type=media_type)
        return BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    # (links to import, expected outcome, expected stored link count)
    steps = (
        ((pdf,), 'insert', 1),             # initial sub-subitem
        ((pdf, text), 'update', 2),        # a second subsubitem, update
        ((pdf, text), 'noop', 2),          # same thing, noop
        ((pdf, other_text), 'update', 2),  # different link for second one
        ((pdf,), 'update', 1),             # delete one, update
    )

    for links, outcome, link_count in steps:
        result = import_with_links(links)
        assert result['bill'][outcome] == 1
        obj = Bill.objects.get()
        assert obj.versions.count() == 1
        assert obj.versions.get().links.count() == link_count
def test_multiple_orgs_of_same_class():
    """
    Memberships must land on the right organization even when several
    organizations in the same jurisdiction share a classification.
    """
    create_jurisdiction()
    for org_id, org_name in (("fnd", "Foundation"), ("fdr", "Federation")):
        Organization.objects.create(id=org_id, name=org_name,
                                    classification="foundation",
                                    jurisdiction_id="fnd-jid")

    # (person name, name of the organization the membership should point at)
    expected = (('Hari Seldon', 'Foundation'),
                ('Jean Luc Picard', 'Federation'))
    scraped = [
        ScrapePerson(person_name, primary_org='foundation', role='founder',
                     primary_org_name=org_name)
        for person_name, org_name in expected
    ]

    person_imp = PersonImporter('fnd-jid')
    for scraped_person in scraped:
        person_imp.import_data([scraped_person.as_dict()])

    # try to import a membership: the first related object of each person
    membership_imp = MembershipImporter('fnd-jid', person_imp,
                                        OrganizationImporter('fnd-jid'),
                                        DumbMockImporter())
    membership_imp.import_data(
        [scraped_person._related[0].as_dict() for scraped_person in scraped])

    for person_name, org_name in expected:
        membership = Person.objects.get(name=person_name).memberships.get()
        assert membership.organization.name == org_name
def do_import(self, juris, args):
    """Import every scraped object type for ``juris`` in one transaction.

    Args:
        juris: object providing ``jurisdiction_id`` for the importers.
        args: object whose ``module`` attribute names the sub-directory of
            ``settings.SCRAPED_DATA_DIR`` to import.

    Returns:
        dict: the reports returned by each ``import_directory`` call, merged
        in import order.
    """
    # Imported here to avoid loading Django code unnecessarily.
    from pupa.importers import (JurisdictionImporter, OrganizationImporter,
                                PersonImporter, PostImporter,
                                MembershipImporter, BillImporter,
                                VoteImporter, EventImporter)

    datadir = os.path.join(settings.SCRAPED_DATA_DIR, args.module)

    juris_importer = JurisdictionImporter(juris.jurisdiction_id)
    org_importer = OrganizationImporter(juris.jurisdiction_id)
    person_importer = PersonImporter(juris.jurisdiction_id)
    post_importer = PostImporter(juris.jurisdiction_id, org_importer)
    membership_importer = MembershipImporter(juris.jurisdiction_id,
                                             person_importer,
                                             org_importer,
                                             post_importer)
    bill_importer = BillImporter(juris.jurisdiction_id, org_importer)
    vote_importer = VoteImporter(juris.jurisdiction_id, person_importer,
                                 org_importer, bill_importer)
    event_importer = EventImporter(juris.jurisdiction_id)

    # (progress message, importer), in the order they are run
    stages = (
        ('import jurisdictions...', juris_importer),
        ('import organizations...', org_importer),
        ('import people...', person_importer),
        ('import posts...', post_importer),
        ('import memberships...', membership_importer),
        ('import bills...', bill_importer),
        ('import events...', event_importer),
        ('import votes...', vote_importer),
    )

    report = {}
    with transaction.atomic():
        for message, importer in stages:
            print(message)
            report.update(importer.import_directory(datadir))
    return report
def test_no_membership_for_person_including_party():
    """
    A party alone does not tie a person to a jurisdiction (which would cause
    duplication), so a no-memberships error is still expected when only a
    party is specified.
    """
    create_jurisdiction()
    Organization.objects.create(id="fnd", name="Foundation",
                                classification="foundation",
                                jurisdiction_id="fnd-jid")
    Organization.objects.create(id="dem", name="Democratic",
                                classification="party")

    # import a person with no memberships
    scraped_person = ScrapePerson('a man without a country',
                                  party='Democratic')
    person_importer = PersonImporter('fnd-jid')
    org_importer = OrganizationImporter('fnd-jid')
    person_importer.import_data([scraped_person.as_dict()])

    # try to import a membership
    mock_importer = DumbMockImporter()
    membership_importer = MembershipImporter('fnd-jid', person_importer,
                                             org_importer, mock_importer)

    with pytest.raises(NoMembershipsError):
        membership_importer.import_data(
            [scraped_person._related[0].as_dict()])
def test_bill_sponsor_limit_lookup():
    """A sponsor lookup by identifier must not cross jurisdiction bounds.

    Two people share a name and an identifier but differ in birth date. The
    first is imported for ``jid`` and given a membership there; the second is
    imported through an importer for ``another-jurisdiction``. The bill's
    sponsorship must resolve to the first person.
    """
    create_jurisdiction()
    org = create_org()

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS",
                                       classification='sponsor',
                                       entity_type='person',
                                       primary=True,
                                       identifier="TOTALLY_REAL_ID",
                                       scheme="TOTALLY_REAL_SCHEME")

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    zs = ScrapePerson(name='Zadock Snodgrass', birth_date="1800-01-01")
    zs.add_identifier(identifier='TOTALLY_REAL_ID',
                      scheme='TOTALLY_REAL_SCHEME')
    pi.import_data([zs.as_dict()])

    za_db = Person.objects.get()
    Membership.objects.create(person_id=za_db.id, organization_id=org.id)

    zs2 = ScrapePerson(name='Zadock Snodgrass', birth_date="1900-01-01")
    zs2.add_identifier(identifier='TOTALLY_REAL_ID',
                       scheme='TOTALLY_REAL_SCHEME')

    # This is contrived and perhaps broken, but we're going to check this.
    # We *really* don't want to *ever* cross jurisdiction bounds.
    # Import the look-alike (zs2) rather than zs again, so the birth_date
    # assertion below can actually tell the two people apart.
    PersonImporter('another-jurisdiction').import_data([zs2.as_dict()])

    BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    obj = Bill.objects.get()
    (entry,) = obj.sponsorships.all()
    assert entry.person.name == "Zadock Snodgrass"
    assert entry.person.birth_date == "1800-01-01"
def test_resolve_json_id():
    """JSON ids of imported people resolve to the stored database id."""
    first = ScrapePerson('Dwayne').as_dict()
    duplicate = ScrapePerson('Dwayne').as_dict()
    importer = PersonImporter('jid')

    # Capture the JSON ids before the dicts are handed to the importer,
    # then do the import and get the database id.
    first_json_id, duplicate_json_id = first['_id'], duplicate['_id']
    importer.import_data([first, duplicate])
    db_id = Person.objects.get().id

    # simplest case
    assert importer.resolve_json_id(first_json_id) == db_id

    # duplicate should resolve to same id
    assert importer.resolve_json_id(duplicate_json_id) == db_id

    # a null id should map to None
    assert importer.resolve_json_id(None) is None

    # no such id
    with pytest.raises(UnresolvedIdError):
        importer.resolve_json_id('this-is-invalid')
def do_import(stream, stransaction):
    """Import the scraped objects in ``stream`` inside one database transaction.

    Args:
        stream: iterable of scraped objects. It is materialized into a list
            because it is scanned once per object type.
        stransaction: object whose ``jurisdiction.id`` is handed to every
            importer.

    Returns:
        dict: the reports returned by each ``import_data`` call, merged in
        import order (organizations, people, posts, memberships).
    """
    scraped = list(stream)
    jid = stransaction.jurisdiction.id

    org_imp = OrganizationImporter(jid)
    person_imp = PersonImporter(jid)
    post_imp = PostImporter(jid, org_imp)
    membership_imp = MembershipImporter(jid, person_imp, org_imp, post_imp)

    def as_dicts(otype):
        # Lazily produce the dict form of every object of the requested type.
        return (item.as_dict() for item in scraped if isinstance(item, otype))

    merged = {}
    with transaction.atomic():
        merged.update(org_imp.import_data(as_dicts(Organization)))
        merged.update(person_imp.import_data(as_dicts(Person)))
        merged.update(post_imp.import_data(as_dicts(Post)))
        merged.update(membership_imp.import_data(as_dicts(Membership)))
    return merged
def test_multiple_memberships():
    """Several memberships in one jurisdiction must not break deduplication."""
    create_jurisdiction()

    # there was a bug where two or more memberships to the same jurisdiction
    # would cause an ORM error, this test ensures that it is fixed
    existing = Person.objects.create(name='Dwayne Johnson')
    for org_name in ('WWE', 'WWF'):
        org = Organization.objects.create(name=org_name,
                                          jurisdiction_id='jid')
        Membership.objects.create(person=existing, organization=org)

    scraped = ScrapePerson('Dwayne Johnson')
    scraped_dict = scraped.as_dict()
    PersonImporter('jid').import_data([scraped_dict])

    # deduplication should still work
    assert Person.objects.all().count() == 1
def test_same_name_people():
    """Walk through the same-name rules of the person importer.

    People sharing a name are rejected unless birth dates tell them apart.
    The reported insert/noop/update counts and the stored person count are
    checked along the way.
    """
    org = Organization.objects.create(name='WWE',
                                      jurisdiction_id='jurisdiction-id')

    def import_people(*people):
        # Each import uses a fresh importer, as a separate run would.
        return PersonImporter('jurisdiction-id').import_data(
            [scraped.as_dict() for scraped in people])

    def add_fake_memberships():
        # create fake memberships so that future lookups work on the
        # imported people
        for stored in Person.objects.all():
            Membership.objects.create(person=stored, organization=org)

    # importing two people with the same name to a pristine database
    # should error
    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Dwayne Johnson', image='http://example.com/2')
    with pytest.raises(SameNameError):
        import_people(p1, p2)

    # importing one person should pass
    import_people(p1)
    add_fake_memberships()

    # importing another person with the same name should fail
    with pytest.raises(SameNameError):
        import_people(p1, p2)

    # adding birth dates should pass
    p1.birth_date = '1970'
    p2.birth_date = '1930'
    resp = import_people(p1, p2)
    assert resp['person']['insert'] == 1
    assert resp['person']['noop'] == 0
    assert resp['person']['update'] == 1
    assert Person.objects.count() == 2
    add_fake_memberships()

    # adding a third person with the same name but without a birthday
    # should error
    p3 = ScrapePerson('Dwayne Johnson', image='http://example.com/3')
    with pytest.raises(SameNameError):
        import_people(p3)

    # and now test that an update works and we can insert a new one with
    # the same name
    p1.image = 'http://example.com/1.jpg'
    p2.birth_date = '1931'  # change birth_date, means a new insert
    resp = import_people(p1, p2)
    assert Person.objects.count() == 3
    assert resp['person']['insert'] == 1
    assert resp['person']['noop'] == 0
    assert resp['person']['update'] == 1
def test_bill_update():
    """``import_item`` reports insert, then noop, then update for a bill."""
    create_jurisdiction()
    create_org()

    org_importer = OrganizationImporter('jid')
    person_importer = PersonImporter('jid')

    def import_item(scraped_bill):
        # A fresh BillImporter per call; returns only the reported outcome.
        importer = BillImporter('jid', org_importer, person_importer)
        _obj, outcome = importer.import_item(scraped_bill.as_dict())
        return outcome

    bill = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')
    assert import_item(bill) == 'insert'
    assert import_item(bill) == 'noop'

    # ensure no new object was created
    assert Bill.objects.count() == 1

    # test basic update
    retitled = ScrapeBill('HB 1', '1900', '1st Bill', chamber='lower')
    assert import_item(retitled) == 'update'
    assert Bill.objects.get().title == '1st Bill'
def test_bill_action_extras():
    """``extras`` passed to ``add_action`` are stored on the imported action."""
    create_jurisdiction()
    create_org()

    scraped_bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                              classification='tax bill', chamber='lower')
    scraped_bill.add_action('sample', '1900-01-01', chamber='lower',
                            extras={'test': 3})

    org_importer = OrganizationImporter('jid')
    person_importer = PersonImporter('jid')
    bill_importer = BillImporter('jid', org_importer, person_importer)
    bill_importer.import_data([scraped_bill.as_dict()])

    stored_bill = Bill.objects.get()
    first_action = stored_bill.actions.all()[0]
    assert first_action.extras == {'test': 3}
def test_resolve_json_id_multiple_family_name():
    """A family name shared by several people must not resolve on its own."""
    create_jurisdiction()
    wwe = Organization.objects.create(name='WWE', jurisdiction_id='jid')

    dwayne = Person.objects.create(name='Dwayne Johnson',
                                   family_name='Johnson')
    dwayne.other_names.create(name='Rock')
    adam = Person.objects.create(name='Adam Johnson', family_name='Johnson')

    for stored in Person.objects.all():
        Membership.objects.create(person=stored, organization=wwe)

    # If there are multiple people with a family name, full name/other name
    # lookups should work but family name lookups should fail.
    importer = PersonImporter('jid')
    full_name_lookups = (
        ('~{"name": "Dwayne Johnson"}', dwayne),
        ('~{"name": "Adam Johnson"}', adam),
    )
    for pseudo_id, person in full_name_lookups:
        assert importer.resolve_json_id(pseudo_id) == person.id

    with pytest.raises(UnresolvedIdError):
        importer.resolve_json_id('~{"name": "Johnson"}')
def test_fix_bill_id():
    """An identifier transformer in ``IMPORT_TRANSFORMERS`` rewrites bill ids.

    The transformer registered for ``'bill'`` turns the scraped identifier
    ``'HB1'`` into ``'HB 1'`` on import.
    """
    create_jurisdiction()
    create_org()

    bill = ScrapeBill('HB1', '1900', 'Test Bill ID',
                      classification='bill', chamber='lower')

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    from pupa.settings import IMPORT_TRANSFORMERS
    IMPORT_TRANSFORMERS['bill'] = {
        'identifier': lambda x: re.sub(r'([A-Z]*)\s*0*([-\d]+)', r'\1 \2', x,
                                       count=1)
    }
    try:
        bi = BillImporter('jid', oi, pi)
        bi.import_data([bill.as_dict()])
    finally:
        # IMPORT_TRANSFORMERS is module-level state: reset it even when the
        # import raises, so the transformer cannot leak into other tests.
        IMPORT_TRANSFORMERS['bill'] = {}

    b = Bill.objects.get()
    assert b.identifier == 'HB 1'
def test_full_bill():
    """Import a fully populated bill and verify every stored sub-object."""
    create_jurisdiction()
    person = Person.objects.create(id='person-id', name='Adam Smith')

    house = ScrapeOrganization(name='House', classification='lower')
    committee = ScrapeOrganization(name='Arbitrary Committee',
                                   classification='committee',
                                   parent_id=house._id)

    oldbill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                         classification='tax bill',
                         from_organization=house._id)

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill',
                      from_organization=house._id)
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    committee_action = bill.add_action('sent to arbitrary committee',
                                       '1900-04-04', chamber='lower')
    committee_action.add_related_entity('arbitrary committee', 'organization',
                                        committee._id)
    bill.add_related_bill("HB 99", legislative_session="1899",
                          relation_type="prior-session")
    bill.add_sponsorship('Adam Smith', classification='extra sponsor',
                         entity_type='person', primary=False,
                         entity_id=person.id)
    bill.add_sponsorship('Jane Smith', classification='lead sponsor',
                         entity_type='person', primary=True)
    bill.add_abstract('This is an act about axes and taxes and tacks.',
                      note="official")
    for url, media_type in (('http://example.com/fn.pdf', 'application/pdf'),
                            ('http://example.com/fn.html', 'text/html')):
        bill.add_document_link('Fiscal Note', url, media_type=media_type)
    bill.add_version_link('Fiscal Note', 'http://example.com/v/1',
                          media_type='text/html')
    bill.add_source('http://example.com/source')

    # import bill
    oi = OrganizationImporter('jid')
    oi.import_data([house.as_dict(), committee.as_dict()])

    pi = PersonImporter('jid')
    # Since we have to create this person behind the back of the import
    # transaction, we'll fake the json-id to db-id, since they match in this
    # case. This is *really* getting at some implementation detail, but it's
    # the cleanest way to ensure we short-circuit the json id lookup.
    pi.json_to_db_id['person-id'] = 'person-id'

    BillImporter('jid', oi, pi).import_data([oldbill.as_dict(),
                                             bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier='HB 1')
    assert b.from_organization.classification == 'lower'
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ['taxes', 'axes']
    assert b.abstracts.get().note == 'official'

    # other_title, other_identifier added
    assert b.other_titles.get().title == 'Tack & Axe Tax Act'
    assert b.other_identifiers.get().identifier == 'SB 9'

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    first_action, second_action = actions
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert first_action.organization == Organization.objects.get(
        classification='lower')
    assert first_action.description == "introduced in house"
    assert second_action.description == "sent to arbitrary committee"
    assert (second_action.related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == 'HB 99'

    # and bill got resolved
    assert rb.related_bill.identifier == 'HB 99'

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    for sponsorship in sponsorships:
        if not sponsorship.primary:
            assert sponsorship.person == person
            continue
        # the primary sponsor was added by name only, so nothing is linked
        assert sponsorship.person is None
        assert sponsorship.organization is None

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1
def test_full_bill():
    """Import a fully populated bill, with its sponsor imported as well.

    Here the sponsoring person is a ``ScrapePerson`` that goes through
    ``PersonImporter``, and the abstract carries a date that is asserted
    after the import.
    """
    create_jurisdiction()

    # Scraped fixtures: the sponsor, a chamber, and a committee beneath it.
    scraped_person = ScrapePerson('Adam Smith')
    house = ScrapeOrganization(name='House', classification='lower')
    committee = ScrapeOrganization(
        name='Arbitrary Committee',
        classification='committee',
        parent_id=house._id,
    )

    # The prior-session bill that the main bill will be related to.
    prior_bill = ScrapeBill(
        'HB 99', '1899', 'Axe & Tack Tax Act',
        classification='tax bill', from_organization=house._id,
    )

    bill = ScrapeBill(
        'HB 1', '1900', 'Axe & Tack Tax Act',
        classification='tax bill', from_organization=house._id,
    )
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    committee_action = bill.add_action(
        'sent to arbitrary committee', '1900-04-04', chamber='lower')
    committee_action.add_related_entity(
        'arbitrary committee', 'organization', committee._id)
    bill.add_related_bill(
        "HB 99", legislative_session="1899", relation_type="prior-session")
    bill.add_sponsorship(
        'Adam Smith', classification='extra sponsor', entity_type='person',
        primary=False, entity_id=scraped_person._id)
    bill.add_sponsorship(
        'Jane Smith', classification='lead sponsor', entity_type='person',
        primary=True)
    bill.add_abstract(
        'This is an act about axes and taxes and tacks.',
        note="official", date='1969-10-20')
    bill.add_document_link(
        'Fiscal Note', 'http://example.com/fn.pdf',
        media_type='application/pdf')
    bill.add_document_link(
        'Fiscal Note', 'http://example.com/fn.html', media_type='text/html')
    bill.add_version_link(
        'Fiscal Note', 'http://example.com/v/1', media_type='text/html')
    bill.add_source('http://example.com/source')

    # Run the importers: organizations, then the person, then the bills.
    org_importer = OrganizationImporter('jid')
    org_importer.import_data([house.as_dict(), committee.as_dict()])
    person_importer = PersonImporter('jid')
    person_importer.import_data([scraped_person.as_dict()])

    bill_importer = BillImporter('jid', org_importer, person_importer)
    bill_importer.import_data([prior_bill.as_dict(), bill.as_dict()])

    # Fetch the imported bill and compare it with what was scraped.
    db_bill = Bill.objects.get(identifier='HB 1')
    assert db_bill.from_organization.classification == 'lower'
    assert db_bill.identifier == bill.identifier
    assert db_bill.title == bill.title
    assert db_bill.classification == bill.classification
    assert db_bill.subject == ['taxes', 'axes']
    assert db_bill.abstracts.get().note == 'official'
    assert db_bill.abstracts.get().date == '1969-10-20'

    # The extra title and identifier were stored.
    assert db_bill.other_titles.get().title == 'Tack & Axe Tax Act'
    assert db_bill.other_identifiers.get().identifier == 'SB 9'

    # Actions: order must be preserved (a failure here would be intermittent).
    actions = list(db_bill.actions.all())
    assert len(actions) == 2
    introduced, referred = actions
    assert introduced.organization == Organization.objects.get(
        classification='lower')
    assert introduced.description == "introduced in house"
    assert referred.description == "sent to arbitrary committee"
    assert (referred.related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # The related bill was added and resolved to the imported prior bill.
    related = db_bill.related_bills.get()
    assert related.identifier == 'HB 99'
    assert related.related_bill.identifier == 'HB 99'

    # Sponsorships: the non-primary one is linked, the primary one is not.
    sponsorships = db_bill.sponsorships.all()
    assert len(sponsorships) == 2
    person = Person.objects.get(name='Adam Smith')
    for sponsorship in sponsorships:
        if not sponsorship.primary:
            assert sponsorship.person == person
            continue
        assert sponsorship.person is None
        assert sponsorship.organization is None

    # One version with one link; one document with two links.
    versions = db_bill.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = db_bill.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # Exactly one source was recorded.
    assert db_bill.sources.count() == 1
def test_bill_update_because_of_subitem():
    """Re-import one bill with varying actions and check the reported outcome.

    The same bill ('HB 1', session '1900') is imported repeatedly; only its
    list of actions changes. Each step asserts which counter the importer
    reports (insert / update / noop) and how many actions are stored.
    """
    create_jurisdiction()
    create_org()
    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    first = ('this is an action', '1900-01-01')
    second = ('this is a second action', '1900-01-02')
    different_second = ('this is a different second action', '1900-01-02')

    def import_with_actions(*actions):
        """Scrape 'HB 1' with the given (description, date) actions and import it."""
        scraped = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')
        for description, date in actions:
            scraped.add_action(description, chamber='lower', date=date)
        return BillImporter('jid', oi, pi).import_data([scraped.as_dict()])

    # initial bill
    result = import_with_actions(first)
    assert result['bill']['insert'] == 1
    obj = Bill.objects.get()
    assert obj.actions.count() == 1
    last_updated = obj.updated_at

    # a second action appears: the bill must be reported as updated
    result = import_with_actions(first, second)
    assert result['bill']['update'] == 1
    obj = Bill.objects.get()
    assert obj.actions.count() == 2
    assert obj.updated_at > last_updated

    # same 2 actions, noop
    result = import_with_actions(first, second)
    assert result['bill']['noop'] == 1
    obj = Bill.objects.get()
    assert obj.actions.count() == 2

    # same 2 actions, different order, update
    result = import_with_actions(second, first)
    assert result['bill']['update'] == 1
    obj = Bill.objects.get()
    assert obj.actions.count() == 2

    # different 2 actions, update
    result = import_with_actions(first, different_second)
    assert result['bill']['update'] == 1
    obj = Bill.objects.get()
    assert obj.actions.count() == 2

    # delete an action, update
    result = import_with_actions(second)
    assert result['bill']['update'] == 1
    obj = Bill.objects.get()
    assert obj.actions.count() == 1

    # delete all actions, update
    result = import_with_actions()
    assert result['bill']['update'] == 1
    obj = Bill.objects.get()
    assert obj.actions.count() == 0

    # and back to initial status, update
    result = import_with_actions(first)
    assert result['bill']['update'] == 1
    obj = Bill.objects.get()
    assert obj.actions.count() == 1
# NOTE(review): the next three statements are the tail of a helper whose
# ``def`` line lies outside this chunk (presumably ``create_jurisdiction``,
# which the test below calls) -- confirm against the full file.
# NOTE(review): the division id says ``country:ca`` while its name is 'USA',
# and the jurisdiction id is 'ojid' while the importers below use 'jid' --
# verify that this is intentional.
    Division.objects.create(id='ocd-division/country:ca', name='USA')
    j = Jurisdiction.objects.create(id='ojid',
                                    division_id='ocd-division/country:ca')
    return j


def ge():
    """Return a fresh all-day ``ScrapeEvent`` named "America's Birthday"."""
    event = ScrapeEvent(
        name="America's Birthday",
        start_date="2014-07-04T05:00Z",
        location_name="America",
        all_day=True)
    return event


# Importers built once at module import time, all for jurisdiction 'jid'.
# The vote-event importer is wired to the person, organization and bill
# importers created just above it.
oi = OrganizationImporter('jid')
pi = PersonImporter('jid')
bi = BillImporter('jid', oi, pi)
vei = VoteEventImporter('jid', pi, oi, bi)


@pytest.mark.django_db
def test_related_people_event():
    # Fixture setup: two people who are both members of one organization.
    create_jurisdiction()
    george = Person.objects.create(id='gw', name='George Washington')
    john = Person.objects.create(id='jqp', name='John Q. Public')
    o = Organization.objects.create(name='Merica', jurisdiction_id='jid')

    Membership.objects.create(person=george, organization=o)
    Membership.objects.create(person=john, organization=o)

    # NOTE(review): nothing follows the creation of the event in this chunk;
    # the remainder of the test (and its assertions) appears to be cut off.
    event1 = ge()
def test_invalid_fields():
    """Importing a person dict with an unknown key must raise DataImportError."""
    person_data = ScrapePerson('Dwayne').as_dict()
    # Inject a key the scraped object did not produce itself.
    person_data.update(newfield="shouldn't happen")

    with pytest.raises(DataImportError):
        PersonImporter('jid').import_data([person_data])
def test_deduplication_identical_object():
    """Two identically scraped people imported together yield one Person row."""
    # Two separate ScrapePerson objects built from the same name.
    duplicates = [ScrapePerson('Dwayne').as_dict() for _ in range(2)]
    PersonImporter('jid').import_data(duplicates)

    assert Person.objects.count() == 1
def do_import(self, juris, args):
    """Import the scraped data directory for one jurisdiction.

    Builds one importer per object type, runs the enabled ones inside a
    single database transaction, then regenerates the data quality report
    for every legislative session seen by the bill and vote event importers.

    Args:
        juris: object exposing ``jurisdiction_id``.
        args: object exposing ``module``, the sub-directory of
            ``settings.SCRAPED_DATA_DIR`` to read from.

    Returns:
        dict: the results of each executed ``import_directory`` call,
        merged together.
    """
    # Imported here rather than at module level to avoid loading Django
    # code unnecessarily.
    from pupa.importers import (JurisdictionImporter, OrganizationImporter,
                                PersonImporter, PostImporter,
                                MembershipImporter, BillImporter,
                                VoteEventImporter, EventImporter)
    from pupa.reports import generate_session_report
    from pupa.models import SessionDataQualityReport

    datadir = os.path.join(settings.SCRAPED_DATA_DIR, args.module)
    jid = juris.jurisdiction_id

    # Later importers receive earlier ones as constructor arguments.
    juris_importer = JurisdictionImporter(jid)
    org_importer = OrganizationImporter(jid)
    person_importer = PersonImporter(jid)
    post_importer = PostImporter(jid, org_importer)
    membership_importer = MembershipImporter(
        jid, person_importer, org_importer, post_importer)
    bill_importer = BillImporter(jid, org_importer, person_importer)
    vote_event_importer = VoteEventImporter(
        jid, person_importer, org_importer, bill_importer)
    event_importer = EventImporter(
        jid, org_importer, person_importer, bill_importer,
        vote_event_importer)

    report = {}

    def run_stage(message, importer):
        """Announce a stage, import the directory, and merge its result."""
        print(message)
        report.update(importer.import_directory(datadir))

    # All enabled stages share one transaction.
    with transaction.atomic():
        run_stage('import jurisdictions...', juris_importer)
        if settings.ENABLE_PEOPLE_AND_ORGS:
            run_stage('import organizations...', org_importer)
            run_stage('import people...', person_importer)
            run_stage('import posts...', post_importer)
            run_stage('import memberships...', membership_importer)
        if settings.ENABLE_BILLS:
            run_stage('import bills...', bill_importer)
        if settings.ENABLE_EVENTS:
            run_stage('import events...', event_importer)
        if settings.ENABLE_VOTES:
            run_stage('import vote events...', vote_event_importer)

    # Collect every session that was touched during this run.
    seen_sessions = set()
    for importer in (bill_importer, vote_event_importer):
        seen_sessions.update(importer.get_seen_sessions())

    for session in seen_sessions:
        new_report = generate_session_report(session)
        # Swap the old report for the new one atomically.
        with transaction.atomic():
            SessionDataQualityReport.objects.filter(
                legislative_session=session).delete()
            new_report.save()

    return report