def do_import(stream, stransaction):
    """Import every scraped object in ``stream`` and return the merged report.

    ``stream`` is materialised once and then filtered per object type.
    Organizations, people, posts and memberships are imported in that order
    inside a single ``transaction.atomic()`` block; each importer's
    ``import_data`` result is folded into the returned dict.

    ``stransaction`` only needs to expose ``.jurisdiction.id``.
    """
    scraped = list(stream)
    jid = stransaction.jurisdiction.id

    orgs = OrganizationImporter(jid)
    people = PersonImporter(jid)
    posts = PostImporter(jid, orgs)
    memberships = MembershipImporter(jid, people, orgs, posts)

    def dicts_of(kind):
        # Lazily yield the dict form of every scraped object of the given type.
        return (obj.as_dict() for obj in scraped if isinstance(obj, kind))

    report = {}
    with transaction.atomic():
        # Order matters: later importers are handed the earlier ones.
        report.update(orgs.import_data(dicts_of(Organization)))
        report.update(people.import_data(dicts_of(Person)))
        report.update(posts.import_data(dicts_of(Post)))
        report.update(memberships.import_data(dicts_of(Membership)))
    return report
def test_deduplication():
    """Posts with the same label in different jurisdictions, or different
    labels in one jurisdiction, are distinct; a role change updates in place.
    """
    executive_org = '~{"classification": "executive"}'

    Organization.objects.create(id='us', name="United States Executive Branch",
                                classification="executive", jurisdiction_id="us")
    Organization.objects.create(id='nc', name="North Carolina Executive Branch",
                                classification="executive", jurisdiction_id="nc")

    president = ScrapePost(label='executive', role='President',
                           organization_id=executive_org)
    vice_president = ScrapePost(label='vice-executive', role='Vice President',
                                organization_id=executive_org)
    governor = ScrapePost(label='executive', role='Governor',
                          organization_id=executive_org)

    # All three must be imported:
    #   president & governor share a label but not a jurisdiction,
    #   president & vice president share a jurisdiction but not a label.
    us_orgs = OrganizationImporter('us')
    nc_orgs = OrganizationImporter('nc')
    PostImporter('us', us_orgs).import_data(
        [president.as_dict(), vice_president.as_dict()])
    PostImporter('nc', nc_orgs).import_data([governor.as_dict()])
    assert Post.objects.count() == 3

    # Re-importing the US 'executive' post with a new role is an update,
    # not a new object.
    king = ScrapePost(label='executive', role='King', organization_id=executive_org)
    PostImporter('us', us_orgs).import_data([king.as_dict()])
    assert Post.objects.count() == 3
    us_executive = Post.objects.get(organization_id='us', label='executive')
    assert us_executive.role == 'King'
def do_import(stream, transaction):
    """Import every scraped object in ``stream`` and return the merged report.

    Jurisdictions, organizations, people, posts and memberships are imported
    in that order.  ``transaction`` is the caller-supplied object exposing
    ``.jurisdiction.id``; this function does not itself open a database
    transaction.
    """
    # The original note cites pupa.clu.commands.update:113 (presumably
    # pupa.cli) and says that, as there, this should be wrapped in a
    # transaction.
    scraped = list(stream)
    jid = transaction.jurisdiction.id

    jurisdictions = JurisdictionImporter(jid)
    orgs = OrganizationImporter(jid)
    people = PersonImporter(jid)
    posts = PostImporter(jid, orgs)
    memberships = MembershipImporter(jid, people, orgs, posts)

    def dicts_of(kind):
        # Lazily yield the dict form of every scraped object of the given type.
        return (obj.as_dict() for obj in scraped if isinstance(obj, kind))

    report = {}
    report.update(jurisdictions.import_data(dicts_of(Jurisdiction)))
    report.update(orgs.import_data(dicts_of(Organization)))
    report.update(people.import_data(dicts_of(Person)))
    report.update(posts.import_data(dicts_of(Post)))
    report.update(memberships.import_data(dicts_of(Membership)))
    return report
def do_import(self, juris, args):
    """Import the scraped data for ``args.module`` and return the merged report.

    Data is read from ``settings.SCRAPED_DATA_DIR/<args.module>``.
    Jurisdictions, organizations, people, posts and memberships are imported
    in that order; each ``import_directory`` result is folded into one dict.
    """
    datadir = os.path.join(settings.SCRAPED_DATA_DIR, args.module)
    jid = juris.jurisdiction_id

    juris_importer = JurisdictionImporter(jid)
    org_importer = OrganizationImporter(jid)
    person_importer = PersonImporter(jid)
    post_importer = PostImporter(jid, org_importer)
    membership_importer = MembershipImporter(jid, person_importer, org_importer,
                                             post_importer)
    # Bill, vote and event importers are not wired in yet.

    # TODO: wrap in a transaction
    report = {}
    for importer in (juris_importer, org_importer, person_importer,
                     post_importer, membership_importer):
        report.update(importer.import_directory(datadir))
    return report
def do_import(stream, transaction):
    """Run all importers over ``stream`` and return their combined report.

    The stream is materialised once, then each importer receives the dict
    form of the objects of its own type, in dependency order: jurisdictions,
    organizations, people, posts, memberships.  ``transaction`` only needs to
    expose ``.jurisdiction.id``; no database transaction is opened here.
    """
    # The original note cites pupa.clu.commands.update:113 (presumably
    # pupa.cli) and says that, as there, this should be wrapped in a
    # transaction.
    objects = list(stream)
    jid = transaction.jurisdiction.id

    juris_imp = JurisdictionImporter(jid)
    org_imp = OrganizationImporter(jid)
    person_imp = PersonImporter(jid)
    post_imp = PostImporter(jid, org_imp)
    membership_imp = MembershipImporter(jid, person_imp, org_imp, post_imp)

    def only(kind):
        # Lazily yield the dict form of every scraped object of the given type.
        for candidate in objects:
            if isinstance(candidate, kind):
                yield candidate.as_dict()

    report = {}
    report.update(juris_imp.import_data(only(Jurisdiction)))
    report.update(org_imp.import_data(only(Organization)))
    report.update(person_imp.import_data(only(Person)))
    report.update(post_imp.import_data(only(Post)))
    report.update(membership_imp.import_data(only(Membership)))
    return report
def test_fix_bill_id():
    """A vote event citing 'HB1' resolves to the bill stored as 'HB 1' when an
    identifier transformer is installed in ``IMPORT_TRANSFORMERS``.
    """
    from pupa.settings import IMPORT_TRANSFORMERS

    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')

    org1 = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Test Bill ID',
                      classification='bill', chamber='lower')

    oi = OrganizationImporter('jid')
    oi.import_data([org1.as_dict()])

    # Normalise identifiers to "<letters> <number>" with leading zeros dropped.
    IMPORT_TRANSFORMERS['bill'] = {
        'identifier': lambda x: re.sub(r'([A-Z]*)\s*0*([-\d]+)', r'\1 \2', x, count=1)
    }
    try:
        bi = BillImporter('jid', oi, DumbMockImporter())
        bi.import_data([bill.as_dict()])

        ve = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                             start_date='1900-04-02', classification='passage:bill',
                             result='fail', bill_chamber='lower', bill='HB1',
                             identifier='4',
                             bill_action='passage',
                             organization=org1._id)

        VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
            ve.as_dict(),
        ])
    finally:
        # Always clear the global transformer so a failure here cannot leak
        # into other tests.
        IMPORT_TRANSFORMERS['bill'] = {}

    ve = VoteEvent.objects.get()
    # This comparison previously had no `assert`, so it checked nothing.
    assert ve.bill.identifier == 'HB 1'
def do_import(self, juris, args):
    """Import the scraped data for ``args.module`` and refresh session reports.

    Data is read from ``settings.SCRAPED_DATA_DIR/<args.module>``.  Which
    groups of importers run is controlled by the ``settings.ENABLE_*`` flags;
    jurisdictions are always imported.  Afterwards a data-quality report is
    regenerated for every session the bill and vote-event importers saw.

    Returns the dict built by merging every ``import_directory`` result.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source;
    # confirm which statements sit under each ``if`` / ``with``.

    # Imported here, not at module level, to avoid loading Django code
    # unnecessarily.
    from pupa.importers import (JurisdictionImporter, OrganizationImporter, PersonImporter,
                                PostImporter, MembershipImporter, BillImporter,
                                VoteEventImporter, EventImporter)
    from pupa.reports import generate_session_report
    from pupa.models import SessionDataQualityReport

    datadir = os.path.join(settings.SCRAPED_DATA_DIR, args.module)

    # Later importers are handed the earlier ones they depend on.
    juris_importer = JurisdictionImporter(juris.jurisdiction_id)
    org_importer = OrganizationImporter(juris.jurisdiction_id)
    person_importer = PersonImporter(juris.jurisdiction_id)
    post_importer = PostImporter(juris.jurisdiction_id, org_importer)
    membership_importer = MembershipImporter(juris.jurisdiction_id, person_importer,
                                             org_importer, post_importer)
    bill_importer = BillImporter(juris.jurisdiction_id, org_importer, person_importer)
    vote_event_importer = VoteEventImporter(juris.jurisdiction_id, person_importer,
                                            org_importer, bill_importer)
    event_importer = EventImporter(juris.jurisdiction_id, org_importer, person_importer,
                                   bill_importer, vote_event_importer)

    report = {}

    # All imports share one atomic block.
    with transaction.atomic():
        print('import jurisdictions...')
        report.update(juris_importer.import_directory(datadir))
        if settings.ENABLE_PEOPLE_AND_ORGS:
            print('import organizations...')
            report.update(org_importer.import_directory(datadir))
            print('import people...')
            report.update(person_importer.import_directory(datadir))
            print('import posts...')
            report.update(post_importer.import_directory(datadir))
            print('import memberships...')
            report.update(membership_importer.import_directory(datadir))

        if settings.ENABLE_BILLS:
            print('import bills...')
            report.update(bill_importer.import_directory(datadir))

        if settings.ENABLE_EVENTS:
            print('import events...')
            report.update(event_importer.import_directory(datadir))

        if settings.ENABLE_VOTES:
            print('import vote events...')
            report.update(vote_event_importer.import_directory(datadir))

    # compile info on all sessions that were updated in this run
    seen_sessions = set()
    seen_sessions.update(bill_importer.get_seen_sessions())
    seen_sessions.update(vote_event_importer.get_seen_sessions())
    for session in seen_sessions:
        new_report = generate_session_report(session)
        # Replace the session's old report(s) and save the new one in one
        # atomic block.
        with transaction.atomic():
            SessionDataQualityReport.objects.filter(legislative_session=session).delete()
            new_report.save()

    return report
def test_deduplication_parties():
    """Importing the same party under two jurisdictions yields one organization."""
    # Parties shouldn't get a jurisdiction id attached, so the second import
    # (under a different jurisdiction) must not create a new row.
    for jurisdiction_id in ('jurisdiction-id', 'new-jurisdiction-id'):
        party = ScrapeOrganization('Wild', classification='party')
        OrganizationImporter(jurisdiction_id).import_data([party.as_dict()])
        assert Organization.objects.count() == 1
def test_deduplication_prevents_identical():
    """Re-importing an organization with an added founding date creates no new row."""
    plain = ScrapeOrganization('United Nations', classification='international')
    dated = ScrapeOrganization('United Nations', classification='international',
                               founding_date='1945')

    for scraped in (plain, dated):
        importer = OrganizationImporter('jurisdiction-id')
        importer.import_data([scraped.as_dict()])
        assert Organization.objects.count() == 1
def test_locked_field():
    """A field named in ``locked_fields`` keeps its value across re-imports."""
    create_jurisdiction()

    def import_shield():
        # Import a freshly scraped 'SHIELD' organization with a new importer.
        scraped = ScrapeOrganization('SHIELD').as_dict()
        importer = OrganizationImporter('jid')
        importer.import_data([scraped])

    import_shield()

    # Set a date by hand and lock the field.
    stored = Organization.objects.get()
    stored.dissolution_date = '2015'
    stored.locked_fields = ['dissolution_date']
    stored.save()

    # Re-import twice: the second pass guards against the locked_fields
    # reversion issue.
    for _ in range(2):
        import_shield()
        stored = Organization.objects.get()
        assert stored.dissolution_date == '2015'
        assert stored.locked_fields == ['dissolution_date']
def test_pseudo_ids():
    """Pseudo-id resolution is limited to the importer's jurisdiction, except
    that the jurisdiction-less party still resolves.
    """
    create_jurisdictions()
    wild = Organization.objects.create(id='1', name='Wild', classification='party')
    senate = Organization.objects.create(id='2', name='Senate', classification='upper',
                                         jurisdiction_id='jid1')
    house = Organization.objects.create(id='3', name='House', classification='lower',
                                        jurisdiction_id='jid1')
    un = Organization.objects.create(id='4', name='United Nations',
                                     classification='international',
                                     jurisdiction_id='jid2')

    jid1_importer = OrganizationImporter('jid1')
    resolvable_from_jid1 = (
        ('~{"classification":"upper"}', senate),
        ('~{"classification":"lower"}', house),
        ('~{"classification":"party", "name":"Wild"}', wild),
    )
    for pseudo_id, expected in resolvable_from_jid1:
        assert jid1_importer.resolve_json_id(pseudo_id) == expected.id

    # The UN belongs to jid2, so jid1 cannot resolve it ...
    with pytest.raises(UnresolvedIdError):
        jid1_importer.resolve_json_id(
            '~{"classification":"international", "name":"United Nations"}')

    # ... but jid2 can.
    jid2_importer = OrganizationImporter('jid2')
    resolved = jid2_importer.resolve_json_id(
        '~{"classification":"international", "name":"United Nations"}')
    assert resolved == un.id
def test_exception_on_identical_objects_in_import_stream():
    """Two scraped objects in one stream that refer to the same organization
    make ``import_data`` raise.
    """
    # These two objects aren't identical, but refer to the same thing.
    # At the moment we consider this an error (there may be a better way to
    # handle this).
    o1 = ScrapeOrganization('X-Men', classification='unknown').as_dict()
    o2 = ScrapeOrganization('X-Men', founding_date='1970',
                            classification='unknown').as_dict()

    # Use the importer we build; a second, unused instance was created before.
    importer = OrganizationImporter('jid')
    # NOTE(review): a narrower exception type would make this stricter; the
    # concrete class is not visible from here.
    with pytest.raises(Exception):
        importer.import_data([o1, o2])
def test_full_vote_event():
    """End-to-end import of a vote event with a count and two named voters."""
    jurisdiction = Jurisdiction.objects.create(id='jid', division_id='did')
    jurisdiction.legislative_sessions.create(name='1900', identifier='1900')

    john = ScrapePerson('John Smith', primary_org='lower')
    adam = ScrapePerson('Adam Smith', primary_org='lower')
    house = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', from_organization=house._id)

    vote_event = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                                 start_date='1900-04-01', classification='passage:bill',
                                 result='pass', bill_chamber='lower', bill='HB 1',
                                 organization=house._id)
    vote_event.set_count('yes', 20)
    vote_event.yes('John Smith')
    vote_event.no('Adam Smith')

    # Import the prerequisites in dependency order.
    org_imp = OrganizationImporter('jid')
    org_imp.import_data([house.as_dict()])
    person_imp = PersonImporter('jid')
    person_imp.import_data([john.as_dict(), adam.as_dict()])
    membership_imp = MembershipImporter('jid', person_imp, org_imp, DumbMockImporter())
    membership_imp.import_data([john._related[0].as_dict(), adam._related[0].as_dict()])
    bill_imp = BillImporter('jid', org_imp, person_imp)
    bill_imp.import_data([bill.as_dict()])

    VoteEventImporter('jid', person_imp, org_imp, bill_imp).import_data(
        [vote_event.as_dict()])

    assert VoteEvent.objects.count() == 1
    stored = VoteEvent.objects.get()
    assert stored.legislative_session == LegislativeSession.objects.get()
    assert stored.motion_classification == ['passage:bill']
    assert stored.bill == Bill.objects.get()

    tally = stored.counts.get()
    assert tally.option == 'yes'
    assert tally.value == 20

    votes = list(stored.votes.all())
    assert len(votes) == 2
    for cast in stored.votes.all():
        # Anything not cast by John Smith is expected to be Adam Smith's 'no'.
        if cast.voter_name == 'John Smith':
            expected_name, expected_option = 'John Smith', 'yes'
        else:
            expected_name, expected_option = 'Adam Smith', 'no'
        assert cast.option == expected_option
        assert cast.voter == Person.objects.get(name=expected_name)
def do_import(self, juris, args):
    """Import the scraped data for ``args.module`` inside one atomic block.

    Data is read from ``settings.SCRAPED_DATA_DIR/<args.module>``.  A progress
    line is printed before each importer runs, and every ``import_directory``
    result is merged into the returned dict.
    """
    # Imported here, not at module level, to avoid loading Django code
    # unnecessarily.
    from pupa.importers import (JurisdictionImporter, OrganizationImporter, PersonImporter,
                                PostImporter, MembershipImporter, BillImporter,
                                VoteImporter, EventImporter, DisclosureImporter)

    datadir = os.path.join(settings.SCRAPED_DATA_DIR, args.module)
    jid = juris.jurisdiction_id

    juris_importer = JurisdictionImporter(jid)
    org_importer = OrganizationImporter(jid)
    person_importer = PersonImporter(jid)
    post_importer = PostImporter(jid, org_importer)
    membership_importer = MembershipImporter(jid, person_importer, org_importer,
                                             post_importer)
    bill_importer = BillImporter(jid, org_importer, person_importer)
    vote_importer = VoteImporter(jid, person_importer, org_importer, bill_importer)
    event_importer = EventImporter(jid, org_importer, person_importer)
    disclosure_importer = DisclosureImporter(jid, org_importer, person_importer,
                                             event_importer)

    # (progress message, importer) in the order they must run.
    steps = (
        ('import jurisdictions...', juris_importer),
        ('import organizations...', org_importer),
        ('import people...', person_importer),
        ('import posts...', post_importer),
        ('import memberships...', membership_importer),
        ('import bills...', bill_importer),
        ('import events...', event_importer),
        ('import disclosures...', disclosure_importer),
        ('import votes...', vote_importer),
    )

    report = {}
    with transaction.atomic():
        for message, importer in steps:
            print(message)
            report.update(importer.import_directory(datadir))
    return report
def test_vote_event_bill_actions_two_stage():
    """Two vote events citing the same bill action don't conflict when they
    arrive in separate imports: the action stays pinned to the first.
    """
    # Very similar to the ve3/ve4 case in test_vote_event_bill_actions (two
    # events referencing one action must not clash on the OneToOneField), but
    # done in two stages so the conflict is found even when the votes weren't
    # part of the same scrape.
    jurisdiction = create_jurisdiction()
    jurisdiction.legislative_sessions.create(name='1900', identifier='1900')

    house = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', from_organization=house._id)
    bill.add_action(description='passage', date='1900-04-02', chamber='lower')

    def passage_vote(pupa_id):
        # Both events are identical apart from the pupa_id that tells them apart.
        event = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                                start_date='1900-04-02', classification='passage:bill',
                                result='pass', bill_chamber='lower', bill='HB 1',
                                bill_action='passage',
                                organization=house._id)
        event.pupa_id = pupa_id
        return event

    first = passage_vote('one')
    second = passage_vote('two')

    org_imp = OrganizationImporter('jid')
    org_imp.import_data([house.as_dict()])
    bill_imp = BillImporter('jid', org_imp, DumbMockImporter())
    bill_imp.import_data([bill.as_dict()])

    # Stage one: the first event imports fine and is linked to the action.
    VoteEventImporter('jid', DumbMockImporter(), org_imp, bill_imp).import_data([
        first.as_dict(),
    ])
    stored = list(VoteEvent.objects.all())
    assert len(stored) == 1
    assert stored[0].bill_action is not None

    # Stage two: with the second event added, the action stays pinned to the
    # first, just as if both had been in the same import.
    VoteEventImporter('jid', DumbMockImporter(), org_imp, bill_imp).import_data([
        first.as_dict(),
        second.as_dict(),
    ])
    stored = list(VoteEvent.objects.all())
    assert len(stored) == 2
    assert stored[0].bill_action is not None
    assert stored[1].bill_action is None
def test_deduplication_other_name_exists():
    """Importing 'UN' after ``create_org()`` leaves exactly one organization."""
    create_jurisdictions()
    # presumably create_org() stores an organization carrying 'UN' as an
    # other name; verify against the helper
    create_org()

    scraped = ScrapeOrganization('UN', classification='international').as_dict()
    OrganizationImporter('jid1').import_data([scraped])

    assert Organization.objects.all().count() == 1
def test_multiple_orgs_of_same_class():
    """
    Memberships can be set on organizations that share a classification
    within one jurisdiction: each person lands in the organization named by
    ``primary_org_name``.
    """
    create_jurisdiction()
    Organization.objects.create(id="fnd", name="Foundation", classification="foundation",
                                jurisdiction_id="fnd-jid")
    Organization.objects.create(id="fdr", name="Federation", classification="foundation",
                                jurisdiction_id="fnd-jid")

    seldon = ScrapePerson('Hari Seldon', primary_org='foundation', role='founder',
                          primary_org_name='Foundation')
    jean_luc = ScrapePerson('Jean Luc Picard', primary_org='foundation', role='founder',
                            primary_org_name='Federation')

    people = PersonImporter('fnd-jid')
    people.import_data([seldon.as_dict()])
    people.import_data([jean_luc.as_dict()])

    # Import both memberships in one batch.
    orgs = OrganizationImporter('fnd-jid')
    membership_imp = MembershipImporter('fnd-jid', people, orgs, DumbMockImporter())
    membership_imp.import_data([seldon._related[0].as_dict(),
                                jean_luc._related[0].as_dict()])

    seldon_org = Person.objects.get(name='Hari Seldon').memberships.get().organization
    assert seldon_org.name == 'Foundation'
    picard_org = Person.objects.get(name='Jean Luc Picard').memberships.get().organization
    assert picard_org.name == 'Federation'
def test_parent_id_resolution():
    """A scraped ``parent_id`` becomes a parent/child link between stored orgs."""
    un = ScrapeOrganization('UN', classification='international')
    unesco = ScrapeOrganization('UNESCO', classification='unknown', parent_id=un._id)

    importer = OrganizationImporter('jurisdiction-id')
    importer.import_data([un.as_dict(), unesco.as_dict()])

    assert Organization.objects.count() == 2
    stored_parent = Organization.objects.get(name='UN')
    assert stored_parent.children.count() == 1
    stored_child = Organization.objects.get(name='UNESCO')
    assert stored_child.parent.name == 'UN'
def test_vote_event_identifier_dedupe():
    """Vote events that carry an identifier are de-duplicated on it:
    unchanged data is a noop, changed data an update, a new identifier an
    insert.
    """
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    Organization.objects.create(id='org-id', name='Legislature',
                                classification='legislature',
                                jurisdiction=j)

    vote_event = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                                 classification='anything', result='passed',
                                 motion_text='a vote on something',
                                 identifier='Roll Call No. 1')
    dmi = DumbMockImporter()
    oi = OrganizationImporter('jid')
    # BillImporter takes (jurisdiction_id, org_importer, person_importer);
    # the two importers were previously passed in swapped order.
    bi = BillImporter('jid', oi, dmi)

    # first import: insert
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    vote_event.result = 'failed'
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # new identifier, insert
    vote_event.identifier = 'Roll Call 2'
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 2
def test_extras_organization():
    """Nested ``extras`` data survives the import."""
    scraped = ScrapeOrganization('United Nations', classification='international')
    scraped.extras = {"hello": "world", "foo": {"bar": "baz"}}

    importer = OrganizationImporter('jurisdiction-id')
    importer.import_data([scraped.as_dict()])

    stored = Organization.objects.get()
    assert stored.extras['foo']['bar'] == 'baz'
def test_full_organization():
    """An organization's identifier, other name, contact detail, link and
    source are all stored by the import.
    """
    create_jurisdictions()
    scraped = ScrapeOrganization('United Nations', classification='international')
    scraped.add_identifier('un')
    scraped.add_name('UN', start_date='1945')
    scraped.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    scraped.add_link('http://example.com/link')
    scraped.add_source('http://example.com/source')

    # import the organization
    OrganizationImporter('jid1').import_data([scraped.as_dict()])

    # fetch the organization from the db and check each related record
    stored = Organization.objects.get()
    assert 'ocd-organization' in stored.id
    assert stored.name == scraped.name

    identifier = stored.identifiers.all()[0]
    assert identifier.identifier == 'un'
    assert identifier.scheme == ''

    other_name = stored.other_names.all()[0]
    assert other_name.name == 'UN'
    assert other_name.start_date == '1945'

    contact = stored.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert stored.links.all()[0].url == 'http://example.com/link'
    assert stored.sources.all()[0].url == 'http://example.com/source'
def test_full_post():
    """A post's dates, membership limit, contact detail and link are stored,
    and its pseudo organization id is resolved to the existing organization.
    """
    create_jurisdictions()
    executive = Organization.objects.create(name="United States Executive Branch",
                                            classification="executive",
                                            jurisdiction_id="us")
    scraped = ScrapePost(label='executive', role='President',
                         organization_id='~{"classification": "executive"}',
                         start_date=datetime.date(2015, 5, 18),
                         end_date='2015-05-19',
                         maximum_memberships=2
                         )
    scraped.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    scraped.add_link('http://example.com/link')

    # import the post
    org_importer = OrganizationImporter('us')
    PostImporter('jurisdiction-id', org_importer).import_data([scraped.as_dict()])
    print(scraped.as_dict())

    # fetch the post from the db and check it imported correctly
    stored = Post.objects.get()
    assert 'ocd-post' in stored.id
    assert stored.label == scraped.label
    assert stored.role == scraped.role
    assert stored.organization_id == executive.id
    assert stored.maximum_memberships == 2

    contact = stored.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert stored.links.all()[0].url == 'http://example.com/link'

    # the date object given as start_date is stored as an ISO string
    assert stored.start_date == '2015-05-18'
    assert stored.end_date == '2015-05-19'
def test_resolve_special_json_id():
    """A label pseudo id resolves to the post in the importer's own jurisdiction."""
    Organization.objects.create(id='us', name="United States Executive Branch",
                                classification="executive", jurisdiction_id="us")
    Organization.objects.create(id='nc', name="North Carolina Executive Branch",
                                classification="executive", jurisdiction_id="nc")

    Post.objects.create(id='pres', label='executive', role='President',
                        organization_id='us')
    Post.objects.create(id='vpres', label='vice-executive', role='Vice President',
                        organization_id='us')
    Post.objects.create(id='gov', label='executive', role='Governor',
                        organization_id='nc')

    org_importer = OrganizationImporter('')

    # (jurisdiction, pseudo id, expected post id)
    expectations = (
        ('us', '~{"label": "executive"}', 'pres'),
        ('us', '~{"label": "vice-executive"}', 'vpres'),
        ('nc', '~{"label": "executive"}', 'gov'),
    )
    for jurisdiction_id, pseudo_id, post_id in expectations:
        importer = PostImporter(jurisdiction_id, org_importer)
        assert importer.resolve_json_id(pseudo_id) == post_id
def test_bill_sponsor_by_identifier():
    """A sponsorship added by (identifier, scheme) is linked to the person who
    carries that identifier, even though the sponsor name differs.
    """
    create_jurisdiction()
    house = create_org()

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS", classification='sponsor',
                                       entity_type='person', primary=True,
                                       identifier="TOTALLY_REAL_ID",
                                       scheme="TOTALLY_REAL_SCHEME")

    org_importer = OrganizationImporter('jid')
    person_importer = PersonImporter('jid')

    # Import the person who owns the identifier and give them a membership.
    snodgrass = ScrapePerson(name='Zadock Snodgrass')
    snodgrass.add_identifier(identifier='TOTALLY_REAL_ID', scheme='TOTALLY_REAL_SCHEME')
    person_importer.import_data([snodgrass.as_dict()])
    stored_person = Person.objects.get()
    Membership.objects.create(person_id=stored_person.id, organization_id=house.id)

    BillImporter('jid', org_importer, person_importer).import_data([bill.as_dict()])

    stored_bill = Bill.objects.get()
    (sponsorship, ) = stored_bill.sponsorships.all()
    assert sponsorship.person.name == "Zadock Snodgrass"
def test_full_post():
    """A post's contact detail and link are stored, and its pseudo
    organization id is resolved to the existing organization.
    """
    executive = Organization.objects.create(name="United States Executive Branch",
                                            classification="executive",
                                            jurisdiction_id="jurisdiction-id")
    scraped = ScrapePost(label='executive', role='President',
                         organization_id='~{"classification": "executive"}')
    scraped.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    scraped.add_link('http://example.com/link')

    # import the post
    org_importer = OrganizationImporter('jurisdiction-id')
    PostImporter('jurisdiction-id', org_importer).import_data([scraped.as_dict()])

    # fetch the post from the db and check it imported correctly
    stored = Post.objects.get()
    assert 'ocd-post' in stored.id
    assert stored.label == scraped.label
    assert stored.role == scraped.role
    assert stored.organization_id == executive.id

    contact = stored.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert stored.links.all()[0].url == 'http://example.com/link'
def test_no_membership_for_person_including_party():
    """
    Even though a party is specified we should still get a no-memberships
    error, because a party doesn't bind the person to a jurisdiction and so
    would cause duplication.
    """
    create_jurisdiction()
    Organization.objects.create(id="fnd", name="Foundation", classification="foundation",
                                jurisdiction_id="fnd-jid")
    Organization.objects.create(id="dem", name="Democratic", classification="party")

    # A person whose only related membership is the party.
    wanderer = ScrapePerson('a man without a country', party='Democratic')
    people = PersonImporter('fnd-jid')
    orgs = OrganizationImporter('fnd-jid')
    people.import_data([wanderer.as_dict()])

    # Importing that party membership must be rejected.
    membership_importer = MembershipImporter('fnd-jid', people, orgs, DumbMockImporter())
    party_membership = wanderer._related[0]
    with pytest.raises(NoMembershipsError):
        membership_importer.import_data([party_membership.as_dict()])
def test_vote_event_bill_id_dedupe():
    """Vote events without an identifier are de-duplicated via their bill:
    unchanged data is a noop, changed data an update, and the same vote on a
    different bill an insert.
    """
    j = create_jurisdiction()
    session = j.legislative_sessions.create(name='1900', identifier='1900')
    org = Organization.objects.create(id='org-id', name='House',
                                      classification='lower',
                                      jurisdiction=j)
    bill = Bill.objects.create(id='bill-1', identifier='HB 1',
                               legislative_session=session,
                               from_organization=org)
    bill2 = Bill.objects.create(id='bill-2', identifier='HB 2',
                                legislative_session=session,
                                from_organization=org)

    vote_event = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                                 classification='anything', result='passed',
                                 motion_text='a vote on something',
                                 bill=bill.identifier, bill_chamber='lower',
                                 chamber='lower')
    dmi = DumbMockImporter()
    oi = OrganizationImporter('jid')
    # BillImporter takes (jurisdiction_id, org_importer, person_importer);
    # the two importers were previously passed in swapped order.
    bi = BillImporter('jid', oi, dmi)

    # first import: insert
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 1

    # same exact vote event, no changes
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'noop'
    assert VoteEvent.objects.count() == 1

    # new info, update
    vote_event.result = 'failed'
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'update'
    assert VoteEvent.objects.count() == 1

    # new vote event (same details, different bill), insert
    vote_event = ScrapeVoteEvent(legislative_session='1900', start_date='2013',
                                 classification='anything', result='passed',
                                 motion_text='a vote on something',
                                 bill=bill2.identifier, bill_chamber='lower',
                                 chamber='lower')
    _, what = VoteEventImporter('jid', dmi, oi, bi).import_item(vote_event.as_dict())
    assert what == 'insert'
    assert VoteEvent.objects.count() == 2
def do_import(self, juris, args):
    """Import the scraped data for ``args.module`` inside one atomic block.

    Data is read from ``settings.SCRAPED_DATA_DIR/<args.module>``.  A progress
    line is printed before each importer runs, and every ``import_directory``
    result is merged into the returned dict.
    """
    # Imported here, not at module level, to avoid loading Django code
    # unnecessarily.
    from pupa.importers import (JurisdictionImporter, OrganizationImporter, PersonImporter,
                                PostImporter, MembershipImporter, BillImporter,
                                VoteEventImporter, EventImporter)

    datadir = os.path.join(settings.SCRAPED_DATA_DIR, args.module)
    jid = juris.jurisdiction_id

    juris_importer = JurisdictionImporter(jid)
    org_importer = OrganizationImporter(jid)
    person_importer = PersonImporter(jid)
    post_importer = PostImporter(jid, org_importer)
    membership_importer = MembershipImporter(jid, person_importer, org_importer,
                                             post_importer)
    bill_importer = BillImporter(jid, org_importer, person_importer)
    vote_event_importer = VoteEventImporter(jid, person_importer, org_importer,
                                            bill_importer)
    event_importer = EventImporter(jid, org_importer, person_importer,
                                   bill_importer, vote_event_importer)

    # (progress message, importer) in the order they must run.
    steps = (
        ('import jurisdictions...', juris_importer),
        ('import organizations...', org_importer),
        ('import people...', person_importer),
        ('import posts...', post_importer),
        ('import memberships...', membership_importer),
        ('import bills...', bill_importer),
        ('import events...', event_importer),
        ('import vote events...', vote_event_importer),
    )

    report = {}
    with transaction.atomic():
        for message, importer in steps:
            print(message)
            report.update(importer.import_directory(datadir))
    return report
def test_deduplication_similar_but_different():
    """Organizations that differ in classification, name, parent or
    jurisdiction are all kept as separate rows.
    """
    base = ScrapeOrganization('United Nations', classification='international')
    other_class = ScrapeOrganization('United Nations', classification='global')
    other_name = ScrapeOrganization('United Nations of Earth',
                                    classification='international')
    with_parent = ScrapeOrganization('United Nations', classification='international',
                                     parent_id=base._id)

    # similar, but none of them is a duplicate of another
    batch = [scraped.as_dict()
             for scraped in (base, other_class, other_name, with_parent)]
    OrganizationImporter('jurisdiction-id').import_data(batch)
    assert Organization.objects.count() == 4

    # the same organization under a different jurisdiction is a new row too
    elsewhere = ScrapeOrganization('United Nations', classification='international')
    OrganizationImporter('new-jurisdiction-id').import_data([elsewhere.as_dict()])
    assert Organization.objects.count() == 5
def test_bill_chamber_param():
    """A bill scraped with ``chamber='lower'`` is attached to the organization
    returned by ``create_org()``.
    """
    create_jurisdiction()
    expected_org = create_org()

    scraped = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                         classification='tax bill', chamber='lower')

    org_importer = OrganizationImporter('jid')
    bill_importer = BillImporter('jid', org_importer)
    bill_importer.import_data([scraped.as_dict()])

    stored = Bill.objects.get()
    assert stored.from_organization_id == expected_org.id
def test_bill_update_subsubitem():
    """Adding, changing or removing a version's links is reported as insert/update/noop."""
    create_jurisdiction()
    create_org()
    org_importer = OrganizationImporter('jid')
    person_importer = PersonImporter('jid')

    pdf = ('http://example.com/test.pdf', 'application/pdf')
    text = ('http://example.com/test.text', 'text/plain')
    other_text = ('http://example.com/diff-link.txt', 'text/plain')

    def import_with_links(*links):
        # scrape 'HB 1' afresh with the given (url, media_type) links and import it
        scraped = ScrapeBill('HB 1', '1900', 'First Bill', chamber='lower')
        for url, media_type in links:
            scraped.add_version_link('printing', url, media_type=media_type)
        importer = BillImporter('jid', org_importer, person_importer)
        return importer.import_data([scraped.as_dict()])

    def assert_single_version_with(link_count):
        # the bill must keep exactly one version holding ``link_count`` links
        stored = Bill.objects.get()
        assert stored.versions.count() == 1
        assert stored.versions.get().links.count() == link_count

    # initial sub-subitem
    outcome = import_with_links(pdf)
    assert outcome['bill']['insert'] == 1
    assert_single_version_with(1)

    # a second subsubitem, update
    outcome = import_with_links(pdf, text)
    assert outcome['bill']['update'] == 1
    assert_single_version_with(2)

    # same thing, noop
    outcome = import_with_links(pdf, text)
    assert outcome['bill']['noop'] == 1
    assert_single_version_with(2)

    # different link for second one, update
    outcome = import_with_links(pdf, other_text)
    assert outcome['bill']['update'] == 1
    assert_single_version_with(2)

    # delete one, update
    outcome = import_with_links(pdf)
    assert outcome['bill']['update'] == 1
    assert_single_version_with(1)
def test_pseudo_parent_id_resolution():
    """A ``~{...}`` pseudo id used as ``parent_id`` is resolved to the matching parent."""
    create_jurisdictions()

    un = ScrapeOrganization('UN', classification='international')
    unesco = ScrapeOrganization(
        'UNESCO',
        classification='unknown',
        parent_id='~{"classification": "international"}',
    )

    importer = OrganizationImporter('jid1')
    importer.import_data([un.as_dict(), unesco.as_dict()])

    assert Organization.objects.count() == 2

    stored_parent = Organization.objects.get(name='UN')
    assert stored_parent.children.count() == 1

    stored_child = Organization.objects.get(name='UNESCO')
    assert stored_child.parent.name == 'UN'
def test_vote_event_bill_clearing():
    """Re-importing a vote event with corrected motion text must not add a third event."""
    # ensure that we don't wind up with vote events sitting around forever on bills as
    # changes make it look like there are multiple vote events
    j = create_jurisdiction()
    session = j.legislative_sessions.create(name='1900', identifier='1900')
    house = Organization.objects.create(id='org-id', name='House', classification='lower',
                                        jurisdiction=j)
    first_bill = Bill.objects.create(id='bill-1', identifier='HB 1',
                                     legislative_session=session, from_organization=house)
    Bill.objects.create(id='bill-2', identifier='HB 2',
                        legislative_session=session, from_organization=house)

    org_importer = OrganizationImporter('jid')
    mock_importer = DumbMockImporter()
    # NOTE(review): other call sites in this file pass (jid, org_importer, person_importer);
    # here the mock sits in the second slot -- confirm the order is intentional.
    bill_importer = BillImporter('jid', mock_importer, org_importer)

    # everything the two vote events have in common
    shared = dict(
        legislative_session='1900',
        start_date='2013',
        classification='anything',
        result='passed',
        bill=first_bill.identifier,
        bill_chamber='lower',
        chamber='lower',
    )
    misspelled = ScrapeVoteEvent(motion_text='a vote on somthing',  # typo intentional
                                 **shared)
    other = ScrapeVoteEvent(motion_text='a vote on something else', **shared)

    def import_both():
        # have to use import_data so postimport is called
        importer = VoteEventImporter('jid', mock_importer, org_importer, bill_importer)
        importer.import_data([misspelled.as_dict(), other.as_dict()])

    import_both()
    assert VoteEvent.objects.count() == 2

    # a typo is fixed, we don't want 3 vote events now
    misspelled.motion_text = 'a vote on something'
    import_both()
    assert VoteEvent.objects.count() == 2
def do_import(stream, stransaction):
    """Import the scraped objects in ``stream`` and return the merged import report.

    Objects are imported by type in a fixed order -- organizations, people,
    posts, memberships -- inside one ``transaction.atomic()`` block, using
    importers bound to ``stransaction.jurisdiction.id``.
    """
    # materialise once: the items are scanned again for every object type
    items = list(stream)
    jid = stransaction.jurisdiction.id

    org_importer = OrganizationImporter(jid)
    person_importer = PersonImporter(jid)
    post_importer = PostImporter(jid, org_importer)
    membership_importer = MembershipImporter(jid, person_importer, org_importer,
                                             post_importer)

    def dicts_of(otype):
        # lazily yield the dict form of every item that is an instance of ``otype``
        return (item.as_dict() for item in items if isinstance(item, otype))

    # (importer, scraped type) in import order
    plan = (
        (org_importer, Organization),
        (person_importer, Person),
        (post_importer, Post),
        (membership_importer, Membership),
    )

    report = {}
    with transaction.atomic():
        for importer, otype in plan:
            report.update(importer.import_data(dicts_of(otype)))
    return report
def test_locked_field_subitem():
    """With ``other_names`` locked, a re-import that omits the name leaves it in place."""
    scraped = ScrapeOrganization('SHIELD')
    scraped.add_name('S.H.I.E.L.D.')
    OrganizationImporter('jid').import_data([scraped.as_dict()])

    # lock the field
    stored = Organization.objects.get()
    stored.locked_fields = ['other_names']
    stored.save()

    # reimport, this time without the extra name
    without_name = ScrapeOrganization('SHIELD').as_dict()
    OrganizationImporter('jid').import_data([without_name])

    reloaded = Organization.objects.get()
    assert reloaded.other_names.get().name == 'S.H.I.E.L.D.'
def test_psuedo_ids():
    """Pseudo ids resolve within the importer's jurisdiction; an org from another one raises."""
    new_org = Organization.objects.create
    wild = new_org(id='1', name='Wild', classification='party')
    senate = new_org(id='2', name='Senate', classification='legislature',
                     chamber='upper', jurisdiction_id='jid1')
    house = new_org(id='3', name='House', classification='legislature',
                    chamber='lower', jurisdiction_id='jid1')
    un = new_org(id='4', name='United Nations', classification='international',
                 jurisdiction_id='jid2')

    un_pseudo_id = '~{"classification":"international", "name":"United Nations"}'

    jid1_importer = OrganizationImporter('jid1')
    resolvable_from_jid1 = (
        ('~{"classification":"legislature", "chamber":"upper"}', senate),
        ('~{"classification":"legislature", "chamber":"lower"}', house),
        ('~{"classification":"party", "name":"Wild"}', wild),
    )
    for pseudo_id, expected in resolvable_from_jid1:
        assert jid1_importer.resolve_json_id(pseudo_id) == expected.id

    # the UN was created under jid2, so the jid1 importer cannot resolve it
    with pytest.raises(ValueError):
        jid1_importer.resolve_json_id(un_pseudo_id)

    jid2_importer = OrganizationImporter('jid2')
    assert jid2_importer.resolve_json_id(un_pseudo_id) == un.id
def test_deduplication_overlap_name_distinct_juris():
    """An overlapping other-name dedupes within one jurisdiction but not across two."""
    create_jurisdictions()

    existing = Organization.objects.create(
        name='World Wrestling Federation',
        classification='international',
        jurisdiction_id='jid1',
    )
    existing.other_names.create(name='WWF')

    scraped = ScrapeOrganization(name="WWF", classification="international")
    scraped.add_name('WWF')

    # jid1 already holds a matching org, so only the jid2 import adds a row
    for jurisdiction_id, expected_total in (('jid1', 1), ('jid2', 2)):
        importer = OrganizationImporter(jurisdiction_id)
        importer.import_item(scraped.as_dict())
        assert Organization.objects.count() == expected_total
def test_locked_field_subitem():
    """With ``other_names`` locked, a re-import that omits the name leaves it in place."""
    create_jurisdiction()

    first_scrape = ScrapeOrganization('SHIELD')
    first_scrape.add_name('S.H.I.E.L.D.')
    OrganizationImporter('jid').import_data([first_scrape.as_dict()])

    # lock the field
    locked = Organization.objects.get()
    locked.locked_fields = ['other_names']
    locked.save()

    # reimport, this time without the extra name
    second_scrape = ScrapeOrganization('SHIELD').as_dict()
    OrganizationImporter('jid').import_data([second_scrape])

    survivor = Organization.objects.get()
    assert survivor.other_names.get().name == 'S.H.I.E.L.D.'
def test_vote_event_bill_actions_errors():
    """Vote events whose ``bill_action`` cannot be matched unambiguously stay unlinked.

    Covers three failure modes: an action description that matches two
    identical actions, one that matches no action, and two vote events
    competing for the same single action (only the first one imported wins).

    The imported vote events are looked up by their scraped ``identifier``
    rather than by position in an unordered queryset, so the assertions do
    not depend on the row order the database happens to return.
    """
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    org1 = ScrapeOrganization(name='House', classification='lower')
    org2 = ScrapeOrganization(name='Senate', classification='upper')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act', from_organization=org1._id)

    # for this bill, two identical actions, so vote matching will fail
    bill.add_action(description='passage', date='1900-04-01', chamber='lower')
    bill.add_action(description='passage', date='1900-04-01', chamber='lower')
    # this action is good, but two votes will try to match it
    bill.add_action(description='passage', date='1900-04-02', chamber='lower')

    # will match two actions
    ve1 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                          start_date='1900-04-01', classification='passage:bill',
                          result='pass', bill_chamber='lower', bill='HB 1',
                          identifier='1',
                          bill_action='passage',
                          organization=org1._id)
    # will match no actions
    ve2 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                          start_date='1900-04-01', classification='passage:bill',
                          result='pass', bill_chamber='lower', bill='HB 1',
                          identifier='2',
                          bill_action='committee result',
                          organization=org1._id)
    # these two votes will both match the same action
    ve3 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                          start_date='1900-04-02', classification='passage:bill',
                          result='pass', bill_chamber='lower', bill='HB 1',
                          identifier='3',
                          bill_action='passage',
                          organization=org1._id)
    ve4 = ScrapeVoteEvent(legislative_session='1900', motion_text='passage-syz',
                          start_date='1900-04-02', classification='passage:bill',
                          result='fail', bill_chamber='lower', bill='HB 1',
                          identifier='4',
                          bill_action='passage',
                          organization=org1._id)

    oi = OrganizationImporter('jid')
    oi.import_data([org1.as_dict(), org2.as_dict()])
    bi = BillImporter('jid', oi, DumbMockImporter())
    bi.import_data([bill.as_dict()])

    VoteEventImporter('jid', DumbMockImporter(), oi, bi).import_data([
        ve1.as_dict(),
        ve2.as_dict(),
        ve3.as_dict(),
        ve4.as_dict(),
    ])

    # .get() raises unless exactly one bill was imported
    Bill.objects.get()

    # key by the scraped identifier ('1'..'4') instead of relying on row order
    matched_action = {v.identifier: v.bill_action for v in VoteEvent.objects.all()}

    # isn't matched, was ambiguous across two actions
    assert matched_action['1'] is None
    # isn't matched, no match in actions
    assert matched_action['2'] is None
    # these both try to match the same action, only first will succeed
    assert matched_action['3'] is not None
    assert matched_action['4'] is None
def test_vote_event_bill_actions():
    """Vote events are linked to bill actions by description, organization and date."""
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    house = ScrapeOrganization(name='House', classification='lower')
    senate = ScrapeOrganization(name='Senate', classification='upper')
    scraped_bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                              from_organization=house._id)

    # add actions, passage of upper & lower on same day, something else,
    # then passage in upper again on a different day
    scraped_actions = (
        ('passage', '1900-04-01', 'upper'),
        ('passage', '1900-04-01', 'lower'),
        ('other event', '1900-04-01', 'lower'),
        ('passage', '1900-04-02', 'upper'),
    )
    for description, date, chamber in scraped_actions:
        scraped_bill.add_action(description=description, date=date, chamber=chamber)

    # four passage votes, one per chamber, one on 04-01, and one on 04-02
    shared = dict(
        legislative_session='1900',
        motion_text='passage',
        classification='passage:bill',
        result='pass',
        bill_chamber='lower',
        bill='HB 1',
        bill_action='passage',
    )
    vote_events = [
        ScrapeVoteEvent(start_date=day, organization=org._id, **shared)
        for day in ('1900-04-01', '1900-04-02')
        for org in (house, senate)
    ]

    org_importer = OrganizationImporter('jid')
    org_importer.import_data([house.as_dict(), senate.as_dict()])
    bill_importer = BillImporter('jid', org_importer, DumbMockImporter())
    bill_importer.import_data([scraped_bill.as_dict()])

    vote_importer = VoteEventImporter('jid', DumbMockImporter(), org_importer,
                                      bill_importer)
    vote_importer.import_data([ve.as_dict() for ve in vote_events])

    stored_bill = Bill.objects.get()
    stored_votes = list(VoteEvent.objects.all())
    actions = list(stored_bill.actions.all())
    assert len(actions) == 4
    assert len(stored_votes) == 4

    # (organization classification, start date) -> the action the vote was linked to
    linked = {}
    for vote in stored_votes:
        linked[(vote.organization.classification, vote.start_date)] = vote.bill_action

    # ensure that votes are matched using action, chamber, and date
    assert linked[('upper', '1900-04-01')] == actions[0]
    assert linked[('lower', '1900-04-01')] == actions[1]
    assert linked[('upper', '1900-04-02')] == actions[3]
    assert linked[('lower', '1900-04-02')] is None
def test_full_bill():
    """Import a bill that uses every kind of sub-item and check each one was stored.

    Builds two scraped bills ('HB 99' from session 1899 and 'HB 1' from 1900),
    attaches titles, identifiers, actions, a related bill, sponsorships, an
    abstract, documents, a version and a source to 'HB 1', imports both, then
    asserts on the resulting ``Bill`` row and its related objects.
    """
    create_jurisdiction()
    # created directly in the database, not through PersonImporter
    person = Person.objects.create(id='person-id', name='Adam Smith')
    org = ScrapeOrganization(name='House', classification='lower')
    com = ScrapeOrganization(name='Arbitrary Committee', classification='committee',
                             parent_id=org._id)

    # the prior-session bill that 'HB 1' will point at via add_related_bill
    oldbill = ScrapeBill('HB 99', '1899', 'Axe & Tack Tax Act',
                         classification='tax bill', from_organization=org._id)

    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', from_organization=org._id)
    bill.subject = ['taxes', 'axes']
    bill.add_identifier('SB 9')
    bill.add_title('Tack & Axe Tax Act')
    bill.add_action('introduced in house', '1900-04-01', chamber='lower')
    act = bill.add_action('sent to arbitrary committee', '1900-04-04', chamber='lower')
    act.add_related_entity('arbitrary committee', 'organization', com._id)
    bill.add_related_bill("HB 99", legislative_session="1899", relation_type="prior-session")
    # one sponsorship linked to the existing person, one left unlinked
    bill.add_sponsorship('Adam Smith', classification='extra sponsor', entity_type='person',
                         primary=False, entity_id=person.id)
    bill.add_sponsorship('Jane Smith', classification='lead sponsor', entity_type='person',
                         primary=True)
    bill.add_abstract('This is an act about axes and taxes and tacks.', note="official")
    # two links under the same document note, one link under a version note
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.pdf',
                           media_type='application/pdf')
    bill.add_document_link('Fiscal Note', 'http://example.com/fn.html', media_type='text/html')
    bill.add_version_link('Fiscal Note', 'http://example.com/v/1', media_type='text/html')
    bill.add_source('http://example.com/source')

    # import bill
    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict(), com.as_dict()])

    pi = PersonImporter('jid')
    # Since we have to create this person behind the back of the import
    # transaction, we'll fake the json-id to db-id, since they match in this
    # case. This is *really* getting at some implementation detail, but it's
    # the cleanest way to ensure we short-circuit the json id lookup.
    pi.json_to_db_id['person-id'] = 'person-id'

    BillImporter('jid', oi, pi).import_data([oldbill.as_dict(), bill.as_dict()])

    # get bill from db and assert it imported correctly
    b = Bill.objects.get(identifier='HB 1')
    assert b.from_organization.classification == 'lower'
    assert b.identifier == bill.identifier
    assert b.title == bill.title
    assert b.classification == bill.classification
    assert b.subject == ['taxes', 'axes']
    assert b.abstracts.get().note == 'official'

    # other_title, other_identifier added
    assert b.other_titles.get().title == 'Tack & Axe Tax Act'
    assert b.other_identifiers.get().identifier == 'SB 9'

    # actions
    actions = list(b.actions.all())
    assert len(actions) == 2
    # ensure order was preserved (if this breaks it'll be intermittent)
    assert actions[0].organization == Organization.objects.get(classification='lower')
    assert actions[0].description == "introduced in house"
    assert actions[1].description == "sent to arbitrary committee"
    assert (actions[1].related_entities.get().organization ==
            Organization.objects.get(classification='committee'))

    # related_bills were added
    rb = b.related_bills.get()
    assert rb.identifier == 'HB 99'

    # and bill got resolved
    assert rb.related_bill.identifier == 'HB 99'

    # sponsors added, linked & unlinked
    sponsorships = b.sponsorships.all()
    assert len(sponsorships) == 2
    for ss in sponsorships:
        if ss.primary:
            # 'Jane Smith' was added without an entity_id, so nothing is linked
            assert ss.person is None
            assert ss.organization is None
        else:
            assert ss.person == person

    # versions & documents with their links
    versions = b.versions.all()
    assert len(versions) == 1
    assert versions[0].links.count() == 1
    documents = b.documents.all()
    assert len(documents) == 1
    assert documents[0].links.count() == 2

    # sources
    assert b.sources.count() == 1