def test_bill_sponsor_by_identifier():
    """A sponsorship added by identifier should resolve to the matching person."""
    create_jurisdiction()
    org = create_org()

    bill = ScrapeBill(
        "HB 1", "1900", "Axe & Tack Tax Act", classification="tax bill", chamber="lower"
    )
    bill.add_sponsorship_by_identifier(
        name="SNODGRASS",
        classification="sponsor",
        entity_type="person",
        primary=True,
        identifier="TOTALLY_REAL_ID",
        scheme="TOTALLY_REAL_SCHEME",
    )

    org_importer = OrganizationImporter("jid")
    person_importer = PersonImporter("jid")

    # import a person carrying the identifier the sponsorship points at
    snodgrass = ScrapePerson(name="Zadock Snodgrass")
    snodgrass.add_identifier(identifier="TOTALLY_REAL_ID", scheme="TOTALLY_REAL_SCHEME")
    person_importer.import_data([snodgrass.as_dict()])

    saved_person = Person.objects.get()
    Membership.objects.create(person_id=saved_person.id, organization_id=org.id)

    BillImporter("jid", org_importer, person_importer).import_data([bill.as_dict()])

    imported_bill = Bill.objects.get()
    (sponsorship,) = imported_bill.sponsorships.all()
    assert sponsorship.person.name == "Zadock Snodgrass"
def scrape_member_page(self, chamber, url):
    """Scrape an Ohio House member-listing page and yield a Person per member.

    Skips vacant seats; builds the email address from the district number
    since it follows a fixed pattern per chamber.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    # each member sits in a div whose class list contains 'memberModule'
    for legislator in page.xpath(
        "//div[contains(concat(' ', normalize-space(@class), ' '), "
        "' memberModule ')]"
    ):
        img = legislator.xpath(".//div[@class='thumbnail']//img")[0].attrib["src"]
        data = legislator.xpath(".//div[@class='data']")[0]
        homepage = data.xpath(".//a[@class='black']")[0]
        full_name = homepage.text_content()
        if "Vacant" in full_name:
            continue
        homepage = homepage.attrib["href"]
        party = data.xpath(".//span[@class='partyLetter']")[0].text_content()
        party = {"R": "Republican", "D": "Democratic"}[party]
        office_lines = data.xpath("child::text()")
        # last text line is the phone number; remove it from the address lines
        phone = office_lines.pop(-1)
        # leadership titles (Leader/Whip/Speaker) appear as an extra first line
        if re.search(r"(Leader|Whip|Speaker)", office_lines[0]):
            office_lines.pop(0)
        office = "\n".join(office_lines)
        h3 = data.xpath("./h3")
        if len(h3):
            # district appears after the <br> inside the h3 header
            h3 = h3[0]
            district = h3.xpath("./br")[0].tail.replace("District", "").strip()
        else:
            # fall back to the district number embedded in the background
            # image filename, e.g. ".../42.png"
            district = re.findall(r"\d+\.png", legislator.attrib["style"])[
                -1
            ].split(".", 1)[0]
        full_name = re.sub(r"\s+", " ", full_name).strip()
        # email follows a fixed zero-padded pattern per chamber
        email = (
            "rep{0:0{width}}@ohiohouse.gov"
            if chamber == "lower"
            else "sd{0:0{width}}@ohiosenate.gov"
        ).format(int(district), width=2)
        leg = Person(
            name=full_name,
            district=district,
            party=party,
            primary_org=chamber,
            image=img,
        )
        leg.add_contact_detail(type="address", value=office, note="Capitol Office")
        leg.add_contact_detail(type="voice", value=phone, note="Capitol Office")
        leg.add_contact_detail(type="email", value=email, note="Capitol Office")
        self.scrape_homepage(leg, chamber, homepage)
        leg.add_source(url)
        leg.add_link(homepage)
        yield leg
def test_no_membership_for_person_including_party():
    """A party-only membership must still raise NoMembershipsError.

    Party membership does not bind the person to a jurisdiction, so it
    cannot be relied on for deduplication.
    """
    create_jurisdiction()
    Organization.objects.create(
        id="fnd",
        name="Foundation",
        classification="foundation",
        jurisdiction_id="fnd-jid",
    )
    Organization.objects.create(id="dem", name="Democratic", classification="party")

    # the person's only related membership is the party one
    scraped = ScrapePerson("a man without a country", party="Democratic")
    person_importer = PersonImporter("fnd-jid")
    org_importer = OrganizationImporter("fnd-jid")
    person_importer.import_data([scraped.as_dict()])

    membership_importer = MembershipImporter(
        "fnd-jid", person_importer, org_importer, DumbMockImporter()
    )
    with pytest.raises(NoMembershipsError):
        membership_importer.import_data([scraped._related[0].as_dict()])
def scrape_lower(self, chamber):
    """Scrape the Michigan House roster table and yield a Person per member.

    Skips vacant districts; falls back to an empty image when no photo URL
    can be fetched.
    """
    url = "http://www.house.mi.gov/mhrpublic/frmRepList.aspx"
    # column order of the roster table, used to label each <td> by position
    table = ["website", "district", "name", "party", "location", "phone", "email"]
    data = self.get(url).text
    doc = lxml.html.fromstring(data)
    # skip two rows at top
    for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
        tds = row.xpath(".//td")
        # header/spacer rows have no cells
        if len(tds) == 0:
            continue
        metainf = {}
        for i in range(0, len(table)):
            metainf[table[i]] = tds[i]
        district = str(int(metainf["district"].text_content().strip()))
        party = metainf["party"].text_content().strip()
        phone = metainf["phone"].text_content().strip()
        email = metainf["email"].text_content().strip()
        name = metainf["name"].text_content().strip()
        # vacant seats show "Vacant" or just "District NN" as the name
        if name == "Vacant" or re.match(r"^District \d{1,3}$", name):
            self.warning(
                "District {} appears vacant, and will be skipped".format(district)
            )
            continue
        leg_url = metainf["website"].xpath("./a")[0].attrib["href"]
        office = metainf["location"].text_content().strip()
        # expand building abbreviations to full mailing addresses
        office = re.sub(
            " HOB",
            " Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933",
            office,
        )
        office = re.sub(" CB", " State Capitol Building\nLansing, MI 48909", office)
        try:
            photo_url = self.get_photo_url(leg_url)[0]
        except (scrapelib.HTTPError, IndexError):
            # best-effort: photo is optional, warn and continue
            photo_url = ""
            self.warning("no photo url for %s", name)
        person = Person(
            name=name,
            district=district,
            party=abbr[party],  # abbr maps the site's party label to ours
            primary_org="lower",
            image=photo_url,
        )
        person.add_link(leg_url)
        person.add_source(leg_url)
        person.add_contact_detail(
            type="address", value=office, note="Capitol Office"
        )
        person.add_contact_detail(type="voice", value=phone, note="Capitol Office")
        person.add_contact_detail(type="email", value=email, note="Capitol Office")
        yield person
def test_deduplication_no_name_overlap():
    """An entirely new name in the same jurisdiction creates a second person."""
    create_jurisdiction()
    create_person()
    # sanity check that dedup doesn't merge everyone in the same org
    new_person = ScrapePerson("CM Punk").as_dict()
    PersonImporter("jid").import_data([new_person])
    assert Person.objects.all().count() == 2
def test_person_add_party():
    """add_party should create a pseudo-id membership to the party org."""
    groot = Person("Groot")
    groot.add_party("Green")
    membership = groot._related[0]
    membership.validate()
    expected = {"name": "Green", "classification": "party"}
    assert get_pseudo_id(membership.organization_id) == expected
def test_deduplication_other_name_exists():
    """A scraped name matching a stored other_name deduplicates."""
    create_jurisdiction()
    create_person()
    # "Rocky" is already present in the saved person's other_names
    data = ScrapePerson("Rocky").as_dict()
    PersonImporter("jid").import_data([data])
    assert Person.objects.all().count() == 1
def test_deduplication_same_name():
    """Importing an identical name in the same jurisdiction deduplicates."""
    create_jurisdiction()
    create_person()
    # simplest possible case: exact same name
    duplicate = ScrapePerson("Dwayne Johnson").as_dict()
    PersonImporter("jid").import_data([duplicate])
    assert Person.objects.all().count() == 1
def test_deduplication_no_jurisdiction_overlap():
    """The same name in a different jurisdiction yields a new person."""
    create_jurisdiction()
    create_person()
    same_name = ScrapePerson("Dwayne Johnson").as_dict()
    PersonImporter("new-jurisdiction-id").import_data([same_name])
    assert Person.objects.all().count() == 2
def test_invalid_fields_related_item():
    """Unknown keys on a related item (a link) raise DataImportError."""
    scraped = ScrapePerson("Dwayne")
    scraped.add_link("http://example.com")
    payload = scraped.as_dict()
    payload["links"][0]["test"] = 3  # not a valid link field
    with pytest.raises(DataImportError):
        PersonImporter("jid").import_data([payload])
def scrape_rep(self, url):
    """Scrape a single House member detail page and yield a Person.

    Yields nothing for members marked Resigned or Deceased.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    main = page.xpath('//div[@id="main-info"]')[0]
    # BUG FIX: this is a generator, and under PEP 479 (Python 3.7+) a
    # `raise StopIteration` inside a generator body is converted into a
    # RuntimeError instead of ending iteration.  `return` is the correct
    # way to finish without yielding.
    if "Resigned" in main.text_content():
        print("Member resigned {}".format(url))
        return  # don't yield anything
    if "Deceased" in main.text_content():
        print("Member is deceased {}".format(url))
        return  # don't yield anything
    name = page.xpath('//div[@class="member-name"]/text()')[0].strip()
    name = re.sub(r"\s+", " ", name)
    district_number = page.xpath(
        '//span[contains(text(), "House District:")]'
        "/following-sibling::span/text()")[0].strip()
    # remove anything after first whitespace
    district_number = re.sub(r"\s.*", "", district_number.strip())
    email = None
    # the email link is the anchor containing the envelope icon
    email_content = page.xpath(
        '//a[./i[contains(@class,"fa-envelope")]]/text()')
    if email_content and email_content[0].strip():
        email = email_content[0].strip()
    photo_url = page.xpath('//header[@id="home"]/img/@src')[0]
    party = self.get_rep_table_by_header(page, "Party Affiliation").text.strip()
    party = _party_map[party[0]]  # standardize
    main_p_text = page.xpath('//div[@id="main-info"]/p/text()')
    # first non-empty paragraph line is the district address
    address = [t.strip() for t in main_p_text if t.strip()][0]
    person = Person(
        name=name,
        district=district_number,
        primary_org="lower",
        party=party,
        image=photo_url,
    )
    person.add_contact_detail(type="address", value=address, note="District Office")
    if email:
        person.add_contact_detail(type="email", value=email, note="District Office")
    person.add_link(url)
    person.add_source(url)
    yield person
def scrape_senator_page(self, chamber, url):
    """Scrape the Ohio senators listing page and yield a Person per senator.

    Fetches each senator's homepage for the phone, address, and party.
    """
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)
    for legislator in page.xpath(
        "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
        "' portraitContainer ')]"
    ):
        # portrait URL is embedded in the inline style as url(...)
        img = legislator.xpath(
            ".//div[@class='profileThumbnailBoundingBox']/@style"
        )[0]
        img = img[img.find("(") + 1 : img.find(")")]
        full_name = legislator.xpath(".//div[@class='profileName']/a/text()")[0]
        homepage_url = legislator.xpath(".//a[@class='profileImageLink']")[
            0
        ].attrib["href"]
        # district link text contains "#NN"; keep the number part
        district = legislator.xpath(".//div[@class='profileDistrict']" "/a/text()")[
            0
        ].split("#")[1]
        if "Vacant" in full_name:
            continue
        # NOTE(review): `page` is rebound here to the senator's homepage.
        # The listing node list was already materialized by xpath(), so the
        # loop itself is unaffected, but the shadowing is easy to misread.
        homepage = self.get(homepage_url).text
        page = lxml.html.fromstring(homepage)
        phone = page.xpath("//div[@class='phone']/span/text()")[0]
        address_lines = page.xpath("//div[@class='address']/descendant::*/text()")
        address = "\n".join(address_lines)
        party_image = page.xpath('//div[@class="senatorParty"]/img/@src')[0]
        # NOTE(review): if the party image matches neither branch, `party`
        # is unbound on the first iteration (NameError) or stale from a
        # previous one — confirm whether other parties can appear here.
        if "Republican" in party_image:
            party = "Republican"
        elif "Democrat" in party_image:
            party = "Democratic"
        # email follows a fixed zero-padded pattern per chamber
        email = (
            "rep{0:0{width}}@ohiohouse.gov"
            if chamber == "lower"
            else "sd{0:0{width}}@ohiosenate.gov"
        ).format(int(district), width=2)
        leg = Person(
            name=full_name,
            district=district,
            primary_org=chamber,
            image=img,
            party=party,
        )
        leg.add_contact_detail(type="address", value=address, note="Capitol Office")
        leg.add_contact_detail(type="voice", value=phone, note="Capitol Office")
        leg.add_contact_detail(type="email", value=email, note="Capitol Office")
        leg.add_source(url)
        leg.add_link(homepage_url)
        yield leg
def test_person_add_term():
    """add_term should record the org pseudo-id plus start and end dates."""
    p = Person("Eternal")
    p.add_term("eternal", "council", start_date="0001", end_date="9999")
    term = p._related[0]
    term.validate()
    assert get_pseudo_id(term.organization_id) == {"classification": "council"}
    assert term.start_date == "0001"
    assert term.end_date == "9999"
def test_deduplication_other_name_overlaps():
    """An incoming other_name matching an existing name deduplicates."""
    create_jurisdiction()
    create_person()
    scraped = ScrapePerson("The Rock")
    scraped.add_name("Dwayne Johnson")  # overlaps the saved person's name
    PersonImporter("jid").import_data([scraped.as_dict()])
    assert Person.objects.all().count() == 1
def test_full_vote_event():
    """End-to-end import of a vote event: counts, votes, and voter linkage."""
    j = create_jurisdiction()
    j.legislative_sessions.create(name="1900", identifier="1900")

    yes_voter = ScrapePerson("John Smith", primary_org="lower")
    no_voter = ScrapePerson("Adam Smith", primary_org="lower")
    house = ScrapeOrganization(name="House", classification="lower")
    bill = ScrapeBill("HB 1", "1900", "Axe & Tack Tax Act", from_organization=house._id)
    vote_event = ScrapeVoteEvent(
        legislative_session="1900",
        motion_text="passage",
        start_date="1900-04-01",
        classification="passage:bill",
        result="pass",
        bill_chamber="lower",
        bill="HB 1",
        organization=house._id,
    )
    vote_event.set_count("yes", 20)
    vote_event.yes("John Smith")
    vote_event.no("Adam Smith")

    # import everything the vote event depends on, in dependency order
    org_imp = OrganizationImporter("jid")
    org_imp.import_data([house.as_dict()])
    person_imp = PersonImporter("jid")
    person_imp.import_data([yes_voter.as_dict(), no_voter.as_dict()])
    membership_imp = MembershipImporter("jid", person_imp, org_imp, DumbMockImporter())
    membership_imp.import_data(
        [yes_voter._related[0].as_dict(), no_voter._related[0].as_dict()]
    )
    bill_imp = BillImporter("jid", org_imp, person_imp)
    bill_imp.import_data([bill.as_dict()])
    VoteEventImporter("jid", person_imp, org_imp, bill_imp).import_data(
        [vote_event.as_dict()]
    )

    assert VoteEvent.objects.count() == 1
    ve = VoteEvent.objects.get()
    assert ve.legislative_session == LegislativeSession.objects.get()
    assert ve.motion_classification == ["passage:bill"]
    assert ve.bill == Bill.objects.get()

    count = ve.counts.get()
    assert count.option == "yes"
    assert count.value == 20

    all_votes = list(ve.votes.all())
    assert len(all_votes) == 2
    for vote in all_votes:
        if vote.voter_name == "John Smith":
            assert vote.option == "yes"
            assert vote.voter == Person.objects.get(name="John Smith")
        else:
            assert vote.option == "no"
            assert vote.voter == Person.objects.get(name="Adam Smith")
def test_legislator_related_party():
    """party= on Person creates exactly one party membership on pre_save."""
    adams = Person("John Adams", party="Democratic-Republican")
    adams.pre_save("jurisdiction-id")

    assert len(adams._related) == 1
    membership = adams._related[0]
    assert membership.person_id == adams._id
    assert get_pseudo_id(membership.organization_id) == {
        "classification": "party",
        "name": "Democratic-Republican",
    }
    assert membership.role == "member"
def test_person_add_membership_name():
    """add_membership by org name stores a name-only pseudo-id plus role/date."""
    p = Person("Leonardo DiCaprio")
    p.add_membership(
        "Academy of Motion Picture Arts and Sciences", role="winner", start_date="2016"
    )
    membership = p._related[0]
    membership.validate()
    assert get_pseudo_id(membership.organization_id) == {
        "name": "Academy of Motion Picture Arts and Sciences"
    }
    assert membership.person_id == p._id
    assert membership.role == "winner"
    assert membership.start_date == "2016"
def test_basic_invalid_person():
    """A person whose name is cleared must fail validation."""
    bob = Person("Bob B. Johnson")
    bob.add_source(url="http://example.com")
    bob.validate()  # valid as created
    bob.name = None
    with pytest.raises(ScrapeValueError):
        bob.validate()
def test_legislator_related_district():
    """district= + primary_org= yields one membership with org and post pseudo-ids."""
    adams = Person("John Adams", district="1", primary_org="legislature")
    adams.pre_save("jurisdiction-id")

    assert len(adams._related) == 1
    membership = adams._related[0]
    assert membership.person_id == adams._id
    assert get_pseudo_id(membership.organization_id) == {
        "classification": "legislature"
    }
    assert get_pseudo_id(membership.post_id) == {
        "organization__classification": "legislature",
        "label": "1",
    }
def test_save_object_basics():
    """save_object should register the output filename and json.dump the object."""
    scraper = Scraper(juris, "/tmp/")
    jordan = Person("Michael Jordan")
    jordan.add_source("http://example.com")

    with mock.patch("json.dump") as json_dump:
        scraper.save_object(jordan)

    # filename is derived from the object's type and id
    expected_filename = "person_" + jordan._id + ".json"
    assert expected_filename in scraper.output_names["person"]
    json_dump.assert_called_once_with(jordan.as_dict(), mock.ANY, cls=mock.ANY)
def handle_list_item(self, item):
    """Build a Senator Person from one roster list entry; None for vacant seats."""
    name = " ".join(item.xpath(".//text()"))
    name = re.sub(r"\s+", " ", name).replace(" ,", ",").strip()
    if "Vacant" in name:
        return

    # district and party live in sibling table cells
    district = item.xpath("string(../../td[1])")
    party = item.xpath("string(../../td[2])")
    if party == "Democrat":
        party = "Democratic"
    leg_url = item.get("href")

    senator = Person(
        name=fix_name(name),
        district=district,
        party=party,
        primary_org="upper",
        role="Senator",
    )
    senator.add_link(leg_url)
    senator.add_source(self.url)
    senator.add_source(leg_url)
    self.scrape_page(SenDetail, leg_url, obj=senator)
    return senator
def test_multiple_orgs_of_same_class():
    """Memberships must resolve correctly between two orgs sharing a
    classification inside the same jurisdiction."""
    create_jurisdiction()
    Organization.objects.create(
        id="fnd",
        name="Foundation",
        classification="foundation",
        jurisdiction_id="fnd-jid",
    )
    Organization.objects.create(
        id="fdr",
        name="Federation",
        classification="foundation",
        jurisdiction_id="fnd-jid",
    )

    hari = ScrapePerson(
        "Hari Seldon",
        primary_org="foundation",
        role="founder",
        primary_org_name="Foundation",
    )
    picard = ScrapePerson(
        "Jean Luc Picard",
        primary_org="foundation",
        role="founder",
        primary_org_name="Federation",
    )

    person_imp = PersonImporter("fnd-jid")
    person_imp.import_data([hari.as_dict()])
    person_imp.import_data([picard.as_dict()])

    org_imp = OrganizationImporter("fnd-jid")
    membership_imp = MembershipImporter(
        "fnd-jid", person_imp, org_imp, DumbMockImporter()
    )
    membership_imp.import_data(
        [hari._related[0].as_dict(), picard._related[0].as_dict()]
    )

    # each person should land in the org named by primary_org_name
    hari_org = Person.objects.get(name="Hari Seldon").memberships.get().organization
    assert hari_org.name == "Foundation"
    picard_org = (
        Person.objects.get(name="Jean Luc Picard").memberships.get().organization
    )
    assert picard_org.name == "Federation"
def test_save_related():
    """Objects appended to _related are dumped right after the parent."""
    scraper = Scraper(juris, "/tmp/")
    jordan = Person("Michael Jordan")
    jordan.add_source("http://example.com")
    bulls = Organization("Chicago Bulls", classification="committee")
    bulls.add_source("http://example.com")
    jordan._related.append(bulls)

    with mock.patch("json.dump") as json_dump:
        scraper.save_object(jordan)

    assert json_dump.mock_calls == [
        mock.call(jordan.as_dict(), mock.ANY, cls=mock.ANY),
        mock.call(bulls.as_dict(), mock.ANY, cls=mock.ANY),
    ]
def test_save_object_invalid():
    """Saving an object that fails validation (no source) raises ValueError."""
    scraper = Scraper(juris, "/tmp/")
    jordan = Person("Michael Jordan")
    # no source attached, so validation must fail
    with pytest.raises(ValueError):
        scraper.save_object(jordan)
def test_multiple_memberships():
    """Regression test: two or more memberships in one jurisdiction used to
    cause an ORM error during deduplication."""
    create_jurisdiction()
    rock = Person.objects.create(name="Dwayne Johnson")
    wwe = Organization.objects.create(name="WWE", jurisdiction_id="jid")
    Membership.objects.create(person=rock, organization=wwe)
    wwf = Organization.objects.create(name="WWF", jurisdiction_id="jid")
    Membership.objects.create(person=rock, organization=wwf)

    PersonImporter("jid").import_data([ScrapePerson("Dwayne Johnson").as_dict()])

    # deduplication should still collapse to the single existing person
    assert Person.objects.all().count() == 1
def test_committee_add_member_person():
    """add_member with a Person object links both ids and sets the role."""
    committee = Organization("Defense", classification="committee")
    adams = Person("John Adams")
    committee.add_member(adams, role="chairman")
    membership = committee._related[0]
    assert membership.person_id == adams._id
    assert membership.organization_id == committee._id
    assert membership.role == "chairman"
def test_legislator_related_chamber_district_role():
    """role= is carried onto both the post pseudo-id and the membership."""
    speaker = Person("John Adams", district="1", primary_org="lower", role="Speaker")
    speaker.pre_save("jurisdiction-id")

    assert len(speaker._related) == 1
    membership = speaker._related[0]
    assert membership.person_id == speaker._id
    assert get_pseudo_id(membership.organization_id) == {"classification": "lower"}
    assert get_pseudo_id(membership.post_id) == {
        "organization__classification": "lower",
        "label": "1",
        "role": "Speaker",
    }
    assert membership.role == "Speaker"
def test_no_membership_for_person():
    """A person imported with no memberships at all raises NoMembershipsError."""
    create_jurisdiction()
    Organization.objects.create(
        id="fnd",
        name="Foundation",
        classification="foundation",
        jurisdiction_id="fnd-jid",
    )

    # import a person with zero memberships
    person_imp = PersonImporter("fnd-jid")
    person_imp.import_data([ScrapePerson("a man without a country").as_dict()])

    dumb_imp = DumbMockImporter()
    membership_imp = MembershipImporter("fnd-jid", person_imp, dumb_imp, dumb_imp)
    with pytest.raises(NoMembershipsError):
        membership_imp.import_data([])
def handle_list_item(self, item):
    """Build a Representative Person from one roster entry.

    Parses the "Name (NNA, PARTY)" header text, the address/phone info
    lines, and the mailto link.  Contact details that fail validation are
    omitted instead of crashing.
    """
    photo_url = item.xpath("./img/@src")[0]
    url = item.xpath(".//h5/a/@href")[0]
    name_text = item.xpath(".//h5/a/b/text()")[0]

    # header text looks like "Jane Doe (05B, DFL)"
    name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip("0").upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [
        x.strip()
        for x in item.xpath("./div/text()[normalize-space()]")
        if x.strip()
    ]
    address = "\n".join((info_texts[0], info_texts[1]))

    # BUG FIX: `phone` and `email` were previously assigned only when
    # validation passed, so a malformed phone number or email address
    # raised UnboundLocalError at the add_contact_detail calls below.
    # Default to None and attach each detail only when a valid value exists.
    phone = None
    phone_text = info_texts[2]
    if validate_phone_number(phone_text):
        phone = phone_text

    email = None
    email_text = item.xpath(".//a/@href")[1].replace("mailto:", "").strip()
    if validate_email_address(email_text):
        email = email_text

    rep = Person(
        name=name,
        district=district,
        party=party,
        primary_org="lower",
        role="Representative",
        image=photo_url,
    )
    rep.add_link(url)
    rep.add_contact_detail(type="address", value=address, note="capitol")
    if phone:
        rep.add_contact_detail(type="voice", value=phone, note="capitol")
    if email:
        rep.add_contact_detail(type="email", value=email, note="capitol")
    rep.add_source(self.url)
    yield rep
def scrape_member(self, chamber, link):
    """Scrape an Iowa legislator from a roster link and yield a Person.

    Follows the link to the member's detail page for further scraping.
    """
    name = link.text.strip()
    leg_url = link.get("href")
    # district and party are in sibling table cells of the roster row
    district = link.xpath("string(../../td[3])")
    party = link.xpath("string(../../td[4])")
    # we get email on the next page now
    # email = link.xpath("string(../../td[5])")
    if party == "Democrat":
        party = "Democratic"
    elif party == "No Party Specified":
        party = "Independent"
    # photo service is keyed by general assembly number + personID from the URL
    pid = re.search(r"personID=(\d+)", link.attrib["href"]).group(1)
    photo_url = ("https://www.legis.iowa.gov/photo"
                 "?action=getPhoto&ga=%s&pid=%s" % (self.latest_session(), pid))
    leg = Person(
        name=name,
        primary_org=chamber,
        district=district,
        party=party,
        image=photo_url,
    )
    leg.add_link(leg_url)
    leg.add_source(leg_url)
    # detail page carries the remaining contact info
    leg_page = lxml.html.fromstring(self.get(link.attrib["href"]).text)
    self.scrape_member_page(leg, leg_page)
    yield leg