def table_row_to_legislator_and_profile_url(table_row_element, chamber):
    """Derive a Legislator from an HTML table row lxml Element, and a link to their profile"""
    td_elements = table_row_element.xpath('td')
    (role_element, name_element, district_element,
     party_element, phone_element, email_element) = td_elements

    # Name comes in the form Last, First
    # last_name_first_name = name_element.text_content().strip()
    # full_name = last_name_first_name_to_full_name(last_name_first_name)
    full_name = name_element.text_content().strip()

    district = district_element.text_content().strip()
    party = party_element.text_content().strip()
    if party == 'Democrat':
        party = 'Democratic'

    role = role_element.text_content().strip()
    address = co_address_from_role(role)
    phone = phone_element.text_content().strip()
    email = email_element.text_content().strip()

    (profile_url, ) = name_element.xpath('a/@href')

    print(chamber, district, party)
    legislator = Person(primary_org=chamber, name=full_name,
                        district=district, party=party)
    legislator.add_contact_detail(type='address', value=address, note='Capitol Office')
    legislator.add_contact_detail(type='voice', value=phone, note='Capitol Office')
    legislator.add_contact_detail(type='email', value=email, note='Capitol Office')

    return legislator, profile_url
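# `co_address_from_role` is called above but not defined in this listing. A
# minimal sketch of what it might look like, assuming both Colorado chambers
# share the Capitol street address (the chamber-line strings are assumptions):
def co_address_from_role(role):
    """Map a role string like 'Senator' to a Capitol mailing address (assumed helper)."""
    base = '200 E Colfax Ave\nDenver, CO 80203'
    if role.startswith('Senator'):
        return 'Colorado State Senate\n' + base
    return 'Colorado House of Representatives\n' + base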
def test_deduplication_no_jurisdiction_overlap():
    create_person()
    # make sure we get a new person if we're in a different org
    person = ScrapePerson('Dwayne Johnson')
    pd = person.as_dict()
    PersonImporter('new-jurisdiction-id').import_data([pd])
    assert Person.objects.all().count() == 2
def test_deduplication_no_name_overlap():
    create_person()
    # make sure we're not just being ridiculous and avoiding importing
    # anything in the same org
    person = ScrapePerson('CM Punk')
    pd = person.as_dict()
    PersonImporter('jurisdiction-id').import_data([pd])
    assert Person.objects.all().count() == 2
def scrape_legislator(self, chamber, name, url):
    html = self.get(url).text
    page = lxml.html.fromstring(html)
    page.make_links_absolute(url)

    district = page.xpath('//h1[contains(., "DISTRICT")]/text()').pop() \
        .split()[1].strip().lstrip('0')

    party = page.xpath('//h2').pop().text_content()
    party = re.search(r'\((R|D|I)[ \-\]]', party).group(1)
    if party == 'D':
        party = 'Democratic'
    elif party == 'R':
        party = 'Republican'
    elif party == 'I':
        party = 'Independent'

    photo_url = page.xpath(
        "//img[contains(@src, 'images/members/')]")[0].attrib['src']
    leg = Person(name, district=district, party=party,
                 image=photo_url, primary_org=chamber)
    leg.add_link(url)
    leg.add_source(url)
    self.scrape_offices(leg, page)
    yield leg
def get_council(self):
    council_doc = self.lxmlize(self.COUNCIL_URL)
    member_urls = council_doc.xpath(
        '//table[@summary="City Directory"]/tr//'
        'a[contains(@href, "/directory.aspx?EID=")]/@href')
    for member_url in member_urls:
        member_doc = self.lxmlize(member_url)

        (name, ) = member_doc.xpath('//h1[@class="BioName"]/text()')
        (name, ) = re.findall(r'^(?:Mr\.|Mrs\.|Hon\.)?\s*(.*?)\s*$', name)

        # Returning everything into a list because the number of values
        # returned varies depending on if the person has an email or not
        text_list = member_doc.xpath('//a[@class="BioLink"]/parent::div/text()')
        title = text_list[1].strip()
        (title, ) = re.findall(
            r'^Title: (Council Member,?(?: Ward \d)|Mayor)\s*$', title)

        try:
            (image_url, ) = member_doc.xpath('//span[@class="BioText"]//img/@src')
        except ValueError:
            image_url = ''

        member = Person(name=name, image=image_url,
                        primary_org='legislature', role=title)
        member.add_source(member_url)
        yield member
def get_council(self):
    council_doc = self.lxmlize(self.COUNCIL_URL)
    member_urls = council_doc.xpath(
        '//table[@summary="City Directory"]/tr//'
        'a[contains(@href, "/directory.aspx?EID=")]/@href'
    )
    for member_url in member_urls:
        member_doc = self.lxmlize(member_url)

        (name,) = member_doc.xpath('//span[@class="BioName"]/span/text()')
        (name,) = re.findall(r"^(?:Mr\.|Mrs\.|Hon\.)?\s*(.*?)\s*$", name)

        (title,) = member_doc.xpath('//a[@class="BioLink"]/following-sibling::text()')
        (title,) = re.findall(r"^Title: (Council Member(?: Ward \d)|Mayor)\s*$", title)

        try:
            (image_url,) = member_doc.xpath('//span[@class="BioText"]//img/@src')
        except ValueError:
            image_url = ""

        member = Person(name=name, image=image_url, primary_org="legislature", role=title)
        member.add_source(member_url)
        yield member
def scrape_csv(self, reader):
    for row in reader:
        contributor = Person(
            name="{Contact First Name} {Contact Last Name}".format(**row)
        )
        contributor.add_source(SEARCH_URL)
        yield contributor
def test_bill_sponsor_by_identifier():
    create_jurisdiction()
    org = create_org()
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS",
                                       classification='sponsor',
                                       entity_type='person',
                                       primary=True,
                                       identifier="TOTALLY_REAL_ID",
                                       scheme="TOTALLY_REAL_SCHEME")

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    zs = ScrapePerson(name='Zadock Snodgrass')
    zs.add_identifier(identifier='TOTALLY_REAL_ID', scheme='TOTALLY_REAL_SCHEME')
    pi.import_data([zs.as_dict()])

    za_db = Person.objects.get()
    Membership.objects.create(person_id=za_db.id, organization_id=org.id)

    BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    obj = Bill.objects.get()
    (entry,) = obj.sponsorships.all()
    assert entry.person.name == "Zadock Snodgrass"
def test_deduplication_other_name_exists():
    create_person()
    # Rocky is already saved in other_names
    person = ScrapePerson('Rocky')
    pd = person.as_dict()
    PersonImporter('jurisdiction-id').import_data([pd])
    assert Person.objects.all().count() == 1
def scrape_member(self, chamber, link):
    name = link.text.strip()
    leg_url = link.get('href')
    district = link.xpath("string(../../td[3])")
    party = link.xpath("string(../../td[4])")
    # we get email on the next page now
    # email = link.xpath("string(../../td[5])")

    if party == 'Democrat':
        party = 'Democratic'
    elif party == 'No Party Specified':
        party = 'Independent'

    pid = re.search(r"personID=(\d+)", link.attrib['href']).group(1)
    photo_url = ("https://www.legis.iowa.gov/photo"
                 "?action=getPhoto&ga=%s&pid=%s" % (self.latest_session(), pid))

    leg = Person(
        name=name,
        primary_org=chamber,
        district=district,
        party=party,
        image=photo_url)

    leg.add_link(leg_url)
    leg.add_source(leg_url)

    leg_page = lxml.html.fromstring(self.get(link.attrib['href']).text)
    self.scrape_member_page(leg, leg_page)
    yield leg
def test_deduplication_same_name():
    create_person()
    # simplest case: just the same name
    person = ScrapePerson('Dwayne Johnson')
    pd = person.as_dict()
    PersonImporter('jurisdiction-id').import_data([pd])
    assert Person.objects.all().count() == 1
def test_multiple_orgs_of_same_class():
    """
    We should be able to set memberships on organizations with the
    same classification within the same jurisdictions
    """
    Organization.objects.create(id="fnd", name="Foundation",
                                classification="foundation",
                                jurisdiction_id="fnd-jid")
    Organization.objects.create(id="fdr", name="Federation",
                                classification="foundation",
                                jurisdiction_id="fnd-jid")

    hari = ScrapePerson('Hari Seldon',
                        primary_org='foundation',
                        role='founder',
                        primary_org_name='Foundation')
    picard = ScrapePerson('Jean Luc Picard',
                          primary_org='foundation',
                          role='founder',
                          primary_org_name='Federation')

    person_imp = PersonImporter('fnd-jid')
    person_imp.import_data([hari.as_dict()])
    person_imp.import_data([picard.as_dict()])

    # try to import a membership
    org_imp = OrganizationImporter('fnd-jid')
    dumb_imp = DumbMockImporter()
    memimp = MembershipImporter('fnd-jid', person_imp, org_imp, dumb_imp)
    memimp.import_data([hari._related[0].as_dict(), picard._related[0].as_dict()])

    assert Person.objects.get(name='Hari Seldon').memberships.get().organization.name == 'Foundation'
    assert Person.objects.get(name='Jean Luc Picard').memberships.get().organization.name == 'Federation'
def scrape_counciler(self, url):
    page = self.lxmlize(url)
    who, = page.xpath("//h3[@class='subtitle']/text()")
    district, = page.xpath("//div[@class='right-bar']//h2/text()")
    image, = page.xpath(
        "//div[@class='left-bar']//a[@class='image lightbox']//img"
    )

    member = Person(
        primary_org='legislature',
        name=who,
        district=district,
        image=image.attrib['src']
    )
    member.add_source(url)

    details = page.xpath("//table[@align='center']//td")
    for detail in details:
        detail = detail.text_content().strip()
        if detail is None or detail == "":
            continue

        type_, value = detail.split(":", 1)
        cdtype = {
            "Home Phone": "voice",
            "Address": "address",
            "Email": "email",
            "Cell Phone": "voice",
        }[type_]
        member.add_contact_detail(type=cdtype, note=type_, value=value)

    yield member
def scrape_upper_chamber(self, term):
    url = "http://oksenate.gov/Senators/Default.aspx"
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    for a in doc.xpath('//table[@summary]')[0]. \
            xpath('.//td//a[contains(@href, "biographies")]'):
        tail = a.xpath('..')[0].tail
        if tail:
            district = tail.split()[1]
        else:
            district = a.xpath('../../span')[1].text.split()[1]

        if a.text is None or a.text.strip() == 'Vacant':
            self.warning("District {} appears to be empty".format(district))
            continue
        else:
            match = re.match(r'(.+) \(([A-Z])\)', a.text.strip())
            name, party = match.group(1), self._parties[match.group(2)]

        url = a.get('href')

        person = Person(primary_org='upper',
                        district=district,
                        name=name.strip(),
                        party=party,
                        )
        person.add_link(url)
        person.add_source(url)
        self.scrape_upper_offices(person, url)
        yield person
def scrape_chamber(self, chamber):
    self._party_map = {
        'Democrat': 'Democratic',
        'Republican': 'Republican',
        'Non Affiliated': 'Independent',
        'Not Affiliated': 'Independent',
    }
    if chamber == 'upper':
        url = 'http://senate.legis.state.ak.us/'
    else:
        url = 'http://house.legis.state.ak.us/'

    page = self.lxmlize(url)
    items = page.xpath('//ul[@class="item"]')[1].getchildren()

    for item in items:
        photo_url = item.xpath('.//img/@src')[0]
        name = item.xpath('.//strong/text()')[0]
        leg_url = item.xpath('.//a/@href')[0]
        email = item.xpath('.//a[text()="Email Me"]/@href')
        if email:
            email = email[0].replace('mailto:', '')
        else:
            self.warning('no email for ' + name)

        party = district = None
        skip = False

        for dt in item.xpath('.//dt'):
            dd = dt.xpath('following-sibling::dd')[0].text_content()
            label = dt.text.strip()
            if label == 'Party:':
                party = dd
            elif label == 'District:':
                district = dd
            elif label.startswith('Deceased'):
                skip = True
                self.warning('skipping deceased ' + name)
                break

        if skip:
            continue

        person = Person(
            primary_org=chamber,
            district=district,
            name=name,
            party=self._party_map[party],
            image=photo_url,
        )
        person.add_source(leg_url)
        person.add_link(leg_url)

        # scrape offices
        self._scrape_offices(person, leg_url, email)

        yield person
def test_deduplication_other_name_overlaps():
    create_person()
    # Person has other_name that overlaps w/ existing name
    person = ScrapePerson('The Rock')
    person.add_name('Dwayne Johnson')
    pd = person.as_dict()
    PersonImporter('jurisdiction-id').import_data([pd])
    assert Person.objects.all().count() == 1
def test_invalid_fields_related_item():
    p1 = ScrapePerson('Dwayne')
    p1.add_link('http://example.com')
    p1 = p1.as_dict()
    p1['links'][0]['test'] = 3

    with pytest.raises(DataImportError):
        PersonImporter('jid').import_data([p1])
def test_legislator_related_chamber_district():
    leg = Person('John Adams', district='1', primary_org='upper')
    leg.pre_save('jurisdiction-id')

    assert len(leg._related) == 1
    assert leg._related[0].person_id == leg._id
    assert get_pseudo_id(leg._related[0].organization_id) == {'classification': 'upper'}
    assert get_pseudo_id(leg._related[0].post_id) == {"organization__classification": "upper",
                                                      "label": "1"}
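# `get_pseudo_id` comes from the importer utilities. Pseudo-ids are how scraped
# objects refer to records that only exist after import: a '~' prefix followed
# by a JSON blob of lookup attributes. A minimal sketch of the decoder,
# assuming that wire format:
import json

def get_pseudo_id(pid):
    """Decode '~{"classification": "upper"}' into a lookup dict (sketch)."""
    if not pid.startswith('~'):
        raise ValueError('not a pseudo id: %s' % pid)
    return json.loads(pid[1:])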
def test_person_add_term():
    p = Person('Eternal')
    p.add_term('eternal', 'council', start_date='0001', end_date='9999')
    p._related[0].validate()
    assert get_pseudo_id(p._related[0].organization_id) == {
        'classification': 'council',
    }
    assert p._related[0].start_date == '0001'
    assert p._related[0].end_date == '9999'
def test_person_add_membership():
    p = Person('Bob B. Bear')
    p.add_source('http://example.com')
    o = Organization('test org', classification='unknown')
    p.add_membership(o, role='member', start_date='2007')
    assert len(p._related) == 1
    p._related[0].validate()
    assert p._related[0].person_id == p._id
    assert p._related[0].organization_id == o._id
    assert p._related[0].start_date == '2007'
def test_person_add_membership_name():
    p = Person('Leonardo DiCaprio')
    p.add_membership('Academy of Motion Picture Arts and Sciences',
                     role='winner', start_date='2016')
    p._related[0].validate()
    assert get_pseudo_id(p._related[0].organization_id) == {
        'name': 'Academy of Motion Picture Arts and Sciences'}
    assert p._related[0].person_id == p._id
    assert p._related[0].role == 'winner'
    assert p._related[0].start_date == '2016'
def test_legislator_related_district():
    l = Person('John Adams', district='1', primary_org='legislature')
    l.pre_save('jurisdiction-id')

    assert len(l._related) == 1
    assert l._related[0].person_id == l._id
    assert get_pseudo_id(l._related[0].organization_id) == {'classification': 'legislature'}
    assert get_pseudo_id(l._related[0].post_id) == {"organization__classification": "legislature",
                                                    "label": "1",
                                                    "role": "member"}
    assert l._related[0].role == 'member'
def test_legislator_related_party():
    l = Person('John Adams', party='Democratic-Republican')
    l.pre_save('jurisdiction-id')

    # a party membership
    assert len(l._related) == 1
    assert l._related[0].person_id == l._id
    assert get_pseudo_id(l._related[0].organization_id) == {'classification': 'party',
                                                            'name': 'Democratic-Republican'}
    assert l._related[0].role == 'member'
def handle_list_item(self, item):
    photo_url = item.xpath('./img/@src')[0]
    url = item.xpath('.//h5/a/@href')[0]
    name_text = item.xpath('.//h5/a/b/text()')[0]

    name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip('0').upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [x.strip() for x in item.xpath(
        './div/text()[normalize-space()]'
    ) if x.strip()]
    address = '\n'.join((info_texts[0], info_texts[1]))

    # initialize so a failed validation can't leave these unbound below
    phone = None
    email = None

    phone_text = info_texts[2]
    if validate_phone_number(phone_text):
        phone = phone_text

    email_text = item.xpath('.//a/@href')[1].replace('mailto:', '').strip()
    if validate_email_address(email_text):
        email = email_text

    rep = Person(name=name, district=district, party=party,
                 primary_org='lower', role='Representative',
                 image=photo_url)
    rep.add_link(url)
    rep.add_contact_detail(type='address', value=address, note='capitol')
    if phone:
        rep.add_contact_detail(type='voice', value=phone, note='capitol')
    if email:
        rep.add_contact_detail(type='email', value=email, note='capitol')
    rep.add_source(self.url)

    yield rep
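# `validate_phone_number` and `validate_email_address` are assumed helpers used
# by the handle_list_item variants in this listing; they are not defined here.
# A minimal regex-based sketch of what they might check:
import re

def validate_phone_number(text):
    """Loosely match US phone formats like 651-296-0000 (assumed helper)."""
    return bool(re.match(r'^\(?\d{3}\)?[ .-]?\d{3}[ .-]?\d{4}$', text))

def validate_email_address(text):
    """Cheap sanity check, not full RFC 5322 validation (assumed helper)."""
    return bool(re.match(r'^[^@\s]+@[^@\s]+\.[^@\s]+$', text))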
def scrape_lower(self, chamber):
    url = 'http://www.house.mi.gov/mhrpublic/frmRepList.aspx'
    table = [
        "website",
        "district",
        "name",
        "party",
        "location",
        "phone",
        "email"
    ]

    data = self.get(url).text
    doc = lxml.html.fromstring(data)

    # skip two rows at top
    for row in doc.xpath('//table[@id="grvRepInfo"]/*'):
        tds = row.xpath('.//td')
        if len(tds) == 0:
            continue
        metainf = {}
        for i in range(0, len(table)):
            metainf[table[i]] = tds[i]

        district = str(int(metainf['district'].text_content().strip()))
        party = metainf['party'].text_content().strip()
        phone = metainf['phone'].text_content().strip()
        email = metainf['email'].text_content().strip()
        leg_url = metainf['website'].xpath("./a")[0].attrib['href']
        name = metainf['name'].text_content().strip()
        if name == 'Vacant' or re.match(r'^District \d{1,3}$', name):
            self.warning('District {} appears vacant, and will be skipped'.format(district))
            continue
        office = metainf['location'].text_content().strip()
        office = re.sub(
            ' HOB',
            ' Anderson House Office Building\n124 North Capitol Avenue\nLansing, MI 48933',
            office
        )
        office = re.sub(
            ' CB',
            ' State Capitol Building\nLansing, MI 48909',
            office
        )

        photo_url = self.get_photo_url(leg_url)

        person = Person(name=name, district=district, party=abbr[party],
                        primary_org='lower',
                        image=photo_url[0] if photo_url else None)

        person.add_link(leg_url)
        person.add_source(leg_url)
        person.add_contact_detail(type='address', value=office, note='Capitol Office')
        person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
        person.add_contact_detail(type='email', value=email, note='Capitol Office')

        yield person
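# `abbr` above is an assumed module-level map normalizing the party strings on
# the Michigan House roster; it is not defined in this listing. A plausible
# sketch, assuming the roster uses single-letter abbreviations:
abbr = {
    'D': 'Democratic',
    'R': 'Republican',
    'I': 'Independent',
}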
def test_save_related():
    s = Scraper('jurisdiction', '/tmp/')
    p = Person('Michael Jordan')
    p.add_source('http://example.com')
    o = Organization('Chicago Bulls')
    o.add_source('http://example.com')
    p._related.append(o)

    with mock.patch('json.dump') as json_dump:
        s.save_object(p)

    assert json_dump.mock_calls == [
        mock.call(p.as_dict(), mock.ANY, cls=mock.ANY),
        mock.call(o.as_dict(), mock.ANY, cls=mock.ANY)
    ]
def test_save_object_basics():
    # ensure that save_object dumps a file
    s = Scraper('jurisdiction', '/tmp/')
    p = Person('Michael Jordan')
    p.add_source('http://example.com')

    with mock.patch('json.dump') as json_dump:
        s.save_object(p)

    # ensure object is saved in right place
    filename = 'person_' + p._id + '.json'
    assert filename in s.output_names['person']
    json_dump.assert_called_once_with(p.as_dict(), mock.ANY, cls=mock.ANY)
def scrape_member_page(self, chamber, url):
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for legislator in page.xpath(
            "//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' memberModule ')]"):
        img = legislator.xpath(
            ".//div[@class='thumbnail']//img")[0].attrib['src']
        data = legislator.xpath(".//div[@class='data']")[0]
        homepage = data.xpath(".//a[@class='black']")[0]
        full_name = homepage.text_content()

        if "Vacant" in full_name:
            continue

        homepage = homepage.attrib['href']
        party = data.xpath(
            ".//span[@class='partyLetter']")[0].text_content()
        party = {"R": "Republican", "D": "Democratic"}[party]
        office_lines = data.xpath("child::text()")
        phone = office_lines.pop(-1)
        office = "\n".join(office_lines)
        h3 = data.xpath("./h3")
        if len(h3):
            h3 = h3[0]
            district = h3.xpath("./br")[0].tail.replace("District", "").strip()
        else:
            district = re.findall(
                r"\d+\.png", legislator.attrib['style']
            )[-1].split(".", 1)[0]

        full_name = re.sub(r"\s+", " ", full_name).strip()
        email = (
            'rep{0:0{width}}@ohiohouse.gov'
            if chamber == 'lower' else
            'sd{0:0{width}}@ohiosenate.gov'
        ).format(int(district), width=2)

        leg = Person(name=full_name, district=district,
                     party=party, primary_org=chamber,
                     image=img)

        leg.add_contact_detail(type='address', value=office, note='Capitol Office')
        leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
        leg.add_contact_detail(type='email', value=email, note='Capitol Office')

        self.scrape_homepage(leg, chamber, homepage)

        leg.add_source(url)
        leg.add_link(homepage)
        yield leg
def scrape_alderman(self, ward_num):
    ward_url = "{}/ward-{}".format(Utils.ALDERMEN_HOME, ward_num)
    alderman_url = self.alderman_url(ward_url)
    alderman_page = self.lxmlize(alderman_url)

    # person's name is the only <h1> tag on the page
    name = alderman_page.xpath("//h1/text()")[0]

    # initialize person object with appropriate data so that pupa can
    # automatically create a membership object linking this person to
    # a post in the jurisdiction's "Board of Aldermen" organization
    district = "Ward {} Alderman".format(ward_num)
    person = Person(name=name, district=district, role="Alderman",
                    primary_org="legislature")

    # set additional fields
    person.image = alderman_page.xpath("//div/img/@src")[0]
    phone_number = alderman_page.xpath(
        "//strong[text()='Phone:']/../text()")[1].strip()
    person.add_contact_detail(type="voice", value=phone_number)

    # add sources
    person.add_source(alderman_url, note="profile")
    person.add_source(ward_url, note="ward")

    return person
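# `self.alderman_url` is referenced above but not shown in this listing. A
# minimal sketch, assuming the ward page links through to the alderman's
# profile (the XPath selector is hypothetical):
def alderman_url(self, ward_url):
    """Find the alderman profile link on a ward page (assumed helper)."""
    ward_page = self.lxmlize(ward_url)
    # hypothetical selector: first anchor into an alderman profile page
    return ward_page.xpath('//a[contains(@href, "alderman")]/@href')[0]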
def test_no_membership_for_person():
    org = Organization.objects.create(id="fnd", name="Foundation",
                                      classification="foundation",
                                      jurisdiction_id="fnd-jid")

    # import a person with no memberships
    p = ScrapePerson('a man without a country')
    person_imp = PersonImporter('fnd-jid')
    person_imp.import_data([p.as_dict()])

    # try to import a membership
    dumb_imp = DumbMockImporter()
    memimp = MembershipImporter('fnd-jid', person_imp, dumb_imp, dumb_imp)

    with pytest.raises(NoMembershipsError):
        memimp.import_data([])
def test_save_object_invalid():
    s = Scraper('jurisdiction', '/tmp/')
    p = Person('Michael Jordan')
    # no source, won't validate
    with pytest.raises(ValueError):
        s.save_object(p)
def test_committee_add_member_person():
    c = Organization('Defense', classification='committee')
    p = Person('John Adams')
    c.add_member(p, role='chairman')
    assert c._related[0].person_id == p._id
    assert c._related[0].organization_id == c._id
    assert c._related[0].role == 'chairman'
def scrape_senator_page(self, chamber, url):
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    for legislator in page.xpath(
            "//div[@id='senators']//div[contains(concat(' ', normalize-space(@class), ' '), "
            "' portraitContainer ')]"):
        img = legislator.xpath(
            ".//div[@class='profileThumbnailBoundingBox']/@style")[0]
        img = img[img.find('(') + 1:img.find(')')]
        full_name = legislator.xpath(
            ".//div[@class='profileName']/a/text()")[0]
        homepage_url = legislator.xpath(
            ".//a[@class='profileImageLink']")[0].attrib['href']
        district = legislator.xpath(
            ".//div[@class='profileDistrict']/a/text()")[0].split("#")[1]

        if "Vacant" in full_name:
            continue

        homepage = self.get(homepage_url).text
        page = lxml.html.fromstring(homepage)
        phone = page.xpath("//div[@class='phone']/span/text()")[0]

        address_lines = page.xpath("//div[@class='address']/span/text()")
        address = "\n".join(address_lines)

        party_image = page.xpath('//div[@class="senatorParty"]/img/@src')[0]
        if 'Republican' in party_image:
            party = 'Republican'
        elif 'Democrat' in party_image:
            party = 'Democratic'

        email = (
            'rep{0:0{width}}@ohiohouse.gov'
            if chamber == 'lower' else
            'sd{0:0{width}}@ohiosenate.gov'
        ).format(int(district), width=2)

        leg = Person(name=full_name, district=district,
                     primary_org=chamber, image=img,
                     party=party)

        leg.add_contact_detail(type='address', value=address, note='Capitol Office')
        leg.add_contact_detail(type='voice', value=phone, note='Capitol Office')
        leg.add_contact_detail(type='email', value=email, note='Capitol Office')
        leg.add_source(url)
        leg.add_link(homepage_url)
        yield leg
def test_full_vote_event():
    j = create_jurisdiction()
    j.legislative_sessions.create(name='1900', identifier='1900')
    sp1 = ScrapePerson('John Smith', primary_org='lower')
    sp2 = ScrapePerson('Adam Smith', primary_org='lower')
    org = ScrapeOrganization(name='House', classification='lower')
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      from_organization=org._id)
    vote_event = ScrapeVoteEvent(legislative_session='1900', motion_text='passage',
                                 start_date='1900-04-01', classification='passage:bill',
                                 result='pass', bill_chamber='lower', bill='HB 1',
                                 organization=org._id)
    vote_event.set_count('yes', 20)
    vote_event.yes('John Smith')
    vote_event.no('Adam Smith')

    oi = OrganizationImporter('jid')
    oi.import_data([org.as_dict()])

    pi = PersonImporter('jid')
    pi.import_data([sp1.as_dict(), sp2.as_dict()])

    mi = MembershipImporter('jid', pi, oi, DumbMockImporter())
    mi.import_data([sp1._related[0].as_dict(), sp2._related[0].as_dict()])

    bi = BillImporter('jid', oi, pi)
    bi.import_data([bill.as_dict()])

    VoteEventImporter('jid', pi, oi, bi).import_data([vote_event.as_dict()])

    assert VoteEvent.objects.count() == 1
    ve = VoteEvent.objects.get()
    assert ve.legislative_session == LegislativeSession.objects.get()
    assert ve.motion_classification == ['passage:bill']
    assert ve.bill == Bill.objects.get()
    count = ve.counts.get()
    assert count.option == 'yes'
    assert count.value == 20
    votes = list(ve.votes.all())
    assert len(votes) == 2
    for v in ve.votes.all():
        if v.voter_name == 'John Smith':
            assert v.option == 'yes'
            assert v.voter == Person.objects.get(name='John Smith')
        else:
            assert v.option == 'no'
            assert v.voter == Person.objects.get(name='Adam Smith')
def test_multiple_memberships():
    # there was a bug where two or more memberships to the same jurisdiction
    # would cause an ORM error, this test ensures that it is fixed
    p = Person.objects.create(name='Dwayne Johnson')
    o = Organization.objects.create(name='WWE', jurisdiction_id='jurisdiction-id')
    Membership.objects.create(person=p, organization=o)
    o = Organization.objects.create(name='WWF', jurisdiction_id='jurisdiction-id')
    Membership.objects.create(person=p, organization=o)

    person = ScrapePerson('Dwayne Johnson')
    pd = person.as_dict()
    PersonImporter('jurisdiction-id').import_data([pd])

    # deduplication should still work
    assert Person.objects.all().count() == 1
def test_no_membership_for_person():
    Organization.objects.create(id="fnd", name="Foundation",
                                classification="foundation",
                                jurisdiction_id="fnd-jid")

    # import a person with no memberships
    p = ScrapePerson('a man without a country')
    person_imp = PersonImporter('fnd-jid')
    person_imp.import_data([p.as_dict()])

    # try to import a membership
    dumb_imp = DumbMockImporter()
    memimp = MembershipImporter('fnd-jid', person_imp, dumb_imp, dumb_imp)

    with pytest.raises(NoMembershipsError):
        memimp.import_data([])
def scrape(self):
    urls = Urls(dict(list=legislators_url), self)

    council = Organization('Temecula City Council', classification='legislature')
    council.add_source(urls.list.url)
    yield council

    for tr in urls.list.xpath('//table[2]//tr')[1:]:
        # Parse some attributes.
        name, role = tr.xpath('td/p[1]//font/text()')
        image = tr.xpath('td/img/@src').pop()

        # Create legislator.
        person = Person(name, image=image)

        # Add membership on council.
        memb = person.add_membership(council, role=role)

        # Add email address.
        email, detail_url = tr.xpath('td//a/@href')
        email = email[7:]  # drop the leading 'mailto:'
        memb.contact_details.append(
            dict(type='email', value=email, note='work'))

        # Add sources.
        person.add_source(urls.list.url)
        person.add_source(detail_url)

        yield person
def scrape_upper_chamber(self, term):
    url = 'https://senado.pr.gov/Pages/Senadores.aspx'
    doc = self.lxmlize(url)
    links = self.get_nodes(doc, '//ul[@class="senadores-list"]/li/a/@href')

    for link in links:
        senator_page = self.lxmlize(link)
        profile_links = self.get_nodes(senator_page, '//ul[@class="profiles-links"]/li')

        name_text = self.get_node(
            senator_page, '//span[@class="name"]').text_content().strip()
        name = re.sub(r'^Hon\.', '', name_text, flags=re.IGNORECASE).strip()

        party = profile_links[0].text_content().strip()
        photo_url = self.get_node(senator_page, '//div[@class="avatar"]//img/@src')

        if profile_links[1].text_content().strip() == "Senador por Distrito":
            district_text = self.get_node(
                senator_page,
                '//div[@class="module-distrito"]//span[@class="headline"]').text_content()
            district = district_text.replace('DISTRITO', '', 1).replace('\u200b', '').strip()
        elif profile_links[1].text_content().strip() == "Senador por Acumulación":
            district = "At-Large"

        phone_node = self.get_node(senator_page, '//a[@class="contact-data tel"]')
        phone = phone_node.text_content().strip()

        email_node = self.get_node(senator_page, '//a[@class="contact-data email"]')
        email = email_node.text_content().replace('\u200b', '').strip()

        person = Person(primary_org='upper',
                        district=district,
                        name=name,
                        party=party,
                        image=photo_url)

        person.add_contact_detail(type='email', value=email, note='Capitol Office')
        person.add_contact_detail(type='voice', value=phone, note='Capitol Office')
        person.add_link(link)
        person.add_source(link)

        yield person
def scrape_rep(self, url):
    page = self.get(url).text
    page = lxml.html.fromstring(page)
    page.make_links_absolute(url)

    main = page.xpath('//div[@id="main-info"]')[0]
    if "Resigned" in main.text_content():
        print("Member resigned {}".format(url))
        # don't yield anything; `raise StopIteration` inside a generator
        # becomes a RuntimeError under PEP 479 (Python 3.7+), so return instead
        return

    name = page.xpath('//div[@class="member-name"]/text()')[0].strip()
    name = re.sub(r"\s+", " ", name)

    district_number = page.xpath(
        '//span[contains(text(), "House District:")]'
        "/following-sibling::span/text()")[0].strip()
    # remove anything after first whitespace
    district_number = re.sub(r"\s.*", "", district_number.strip())

    email = None
    email_content = page.xpath('//a[./i[contains(@class,"fa-envelope")]]/text()')
    if email_content and email_content[0].strip():
        email = email_content[0].strip()

    photo_url = page.xpath('//header[@id="home"]/img/@src')[0]

    party = self.get_rep_table_by_header(page, "Party Affiliation").text.strip()
    party = _party_map[party[0]]  # standardize

    main_p_text = page.xpath('//div[@id="main-info"]/p/text()')
    address = [t.strip() for t in main_p_text if t.strip()][0]

    person = Person(
        name=name,
        district=district_number,
        primary_org="lower",
        party=party,
        image=photo_url,
    )
    person.add_contact_detail(type="address", value=address, note="District Office")
    person.add_contact_detail(type="email", value=email, note="District Office")
    person.add_source(url)
    yield person
def handle_list_item(self, item):
    photo_url = item.xpath('./td[1]/a/img/@src')[0]
    info_nodes = item.xpath('./td[2]/p/a')
    name_text = info_nodes[0].xpath('./b/text()')[0]
    url = info_nodes[0].get('href')

    name_match = re.match(r'^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$', name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip('0').upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [
        x.strip() for x in item.xpath(
            './td[2]/p/text()[normalize-space() and preceding-sibling::br]'
        ) if x.strip()
    ]
    address = '\n'.join((info_texts[0], info_texts[1]))

    # initialize so a failed validation can't leave these unbound below
    phone = None
    email = None

    phone_text = info_texts[2]
    if validate_phone_number(phone_text):
        phone = phone_text

    email_node = info_nodes[1]
    email_text = email_node.text
    email_text = email_text.replace('Email: ', '').strip()
    if validate_email_address(email_text):
        email = email_text

    rep = Person(name=name, district=district, party=party,
                 primary_org='lower', role='Representative',
                 image=photo_url)
    rep.add_link(url)
    rep.add_contact_detail(type='address', value=address, note='capitol')
    if phone:
        rep.add_contact_detail(type='voice', value=phone, note='capitol')
    if email:
        rep.add_contact_detail(type='email', value=email, note='capitol')
    rep.add_source(self.url)

    yield rep
def handle_list_item(self, item):
    photo_url = item.xpath("./img/@src")[0]
    url = item.xpath(".//h5/a/@href")[0]
    name_text = item.xpath(".//h5/a/b/text()")[0]

    name_match = re.match(r"^(.+)\(([0-9]{2}[AB]), ([A-Z]+)\)$", name_text)
    name = name_match.group(1).strip()
    district = name_match.group(2).lstrip("0").upper()
    party_text = name_match.group(3)
    party = PARTIES[party_text]

    info_texts = [
        x.strip()
        for x in item.xpath("./div/text()[normalize-space()]")
        if x.strip()
    ]
    address = "\n".join((info_texts[0], info_texts[1]))

    # initialize so a failed validation can't leave these unbound below
    phone = None
    email = None

    phone_text = info_texts[2]
    if validate_phone_number(phone_text):
        phone = phone_text

    email_text = item.xpath(".//a/@href")[1].replace("mailto:", "").strip()
    if validate_email_address(email_text):
        email = email_text

    rep = Person(
        name=name,
        district=district,
        party=party,
        primary_org="lower",
        role="Representative",
        image=photo_url,
    )
    rep.add_link(url)
    rep.add_contact_detail(type="address", value=address, note="capitol")
    if phone:
        rep.add_contact_detail(type="voice", value=phone, note="capitol")
    if email:
        rep.add_contact_detail(type="email", value=email, note="capitol")
    rep.add_source(self.url)

    yield rep
def test_multiple_orgs_of_same_class():
    """
    We should be able to set memberships on organizations with the
    same classification within the same jurisdictions
    """
    create_jurisdiction()
    Organization.objects.create(id="fnd", name="Foundation",
                                classification="foundation",
                                jurisdiction_id="fnd-jid")
    Organization.objects.create(id="fdr", name="Federation",
                                classification="foundation",
                                jurisdiction_id="fnd-jid")

    hari = ScrapePerson('Hari Seldon',
                        primary_org='foundation',
                        role='founder',
                        primary_org_name='Foundation')
    picard = ScrapePerson('Jean Luc Picard',
                          primary_org='foundation',
                          role='founder',
                          primary_org_name='Federation')

    person_imp = PersonImporter('fnd-jid')
    person_imp.import_data([hari.as_dict()])
    person_imp.import_data([picard.as_dict()])

    # try to import a membership
    org_imp = OrganizationImporter('fnd-jid')
    dumb_imp = DumbMockImporter()
    memimp = MembershipImporter('fnd-jid', person_imp, org_imp, dumb_imp)
    memimp.import_data(
        [hari._related[0].as_dict(), picard._related[0].as_dict()])

    assert Person.objects.get(
        name='Hari Seldon').memberships.get().organization.name == 'Foundation'
    assert Person.objects.get(
        name='Jean Luc Picard').memberships.get().organization.name == 'Federation'
def parse_row(self, row, chamber):
    print(row)
    display = '{} {}'.format(row['First Name'], row['Surname'])
    # TODO: map state to ocd
    # TODO: https://www.aph.gov.au/Senators_and_Members/Senators/Senators_by_service_expiry_date
    person = Person(
        name=display,
        district=row['State'],
        role='member',
        primary_org=chamber,
        gender=row['Gender'].lower(),
        party=row['Political Party'],
    )
    person.extras['given_name'] = row['First Name']
    person.extras['family_name'] = row['Surname']
    return person
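# `parse_row` consumes dict-like rows keyed by the CSV's header names. A usage
# sketch, assuming `scraper` is an instance of the scraper class above and
# 'senators.csv' (a hypothetical filename) carries the columns referenced:
import csv

with open('senators.csv', newline='') as fh:
    for row in csv.DictReader(fh):
        person = scraper.parse_row(row, chamber='upper')  # one Person per row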
def scrape_member(self, chamber, link):
    name = link.text.strip()
    leg_url = link.get("href")
    district = link.xpath("string(../../td[3])")
    party = link.xpath("string(../../td[4])")
    # we get email on the next page now
    # email = link.xpath("string(../../td[5])")

    if party == "Democrat":
        party = "Democratic"
    elif party == "No Party Specified":
        party = "Independent"

    pid = re.search(r"personID=(\d+)", link.attrib["href"]).group(1)
    photo_url = ("https://www.legis.iowa.gov/photo"
                 "?action=getPhoto&ga=%s&pid=%s" % (self.latest_session(), pid))

    leg = Person(
        name=name,
        primary_org=chamber,
        district=district,
        party=party,
        image=photo_url,
    )

    leg.add_link(leg_url)
    leg.add_source(leg_url)

    leg_page = lxml.html.fromstring(self.get(link.attrib["href"]).text)
    self.scrape_member_page(leg, leg_page)
    yield leg
def scrape_upper_chamber(self, term):
    url = "http://oksenate.gov/Senators/Default.aspx"
    html = self.get(url).text
    doc = lxml.html.fromstring(html)
    doc.make_links_absolute(url)

    for a in doc.xpath('//table[@summary]')[0]. \
            xpath('.//td//a[contains(@href, "biographies")]'):
        tail = a.xpath('..')[0].tail
        if tail:
            district = tail.split()[1]
        else:
            district = a.xpath('../../span')[1].text.split()[1]

        if a.text is None or a.text.strip() == 'Vacant':
            self.warning(
                "District {} appears to be empty".format(district))
            continue
        else:
            match = re.match(r'(.+) \(([A-Z])\)', a.text.strip())
            name, party = match.group(1), self._parties[match.group(2)]

        url = a.get('href')

        person = Person(
            primary_org='upper',
            district=district,
            name=name.strip(),
            party=party,
        )
        person.add_link(url)
        person.add_source(url)
        self.scrape_upper_offices(person, url)
        yield person
def scrape_lower_legislator(self, url, leg_info):
    page = self.lxmlize(url)

    name = page.xpath(
        '//span[@id="body_FormView5_FULLNAMELabel"]/text()')[0].strip()
    if name.startswith("District ") or name.startswith("Vacant "):
        self.warning("Seat is vacant: {}".format(name))
        return

    photo = page.xpath(
        '//img[contains(@src, "/h_reps/RepPics")]')[0].attrib["src"]

    party_flags = {
        "Democrat": "Democratic",
        "Republican": "Republican",
        "Independent": "Independent",
    }
    party_info = page.xpath(
        '//span[@id="body_FormView5_PARTYAFFILIATIONLabel"]/text()')[0].strip()
    party = party_flags[party_info]

    try:
        email = page.xpath(
            '//span[@id="body_FormView6_EMAILADDRESSPUBLICLabel"]/text()'
        )[0].strip()
    except IndexError:
        email = None

    district = leg_info["dist"].replace("Dist", "").strip()

    person = Person(name=name, party=party, district=district,
                    primary_org="lower", image=photo)

    contacts = [
        (leg_info["office"], "address"),
        (leg_info["phone"], "voice"),
        (email, "email"),
    ]
    for value, key in contacts:
        if value:
            person.add_contact_detail(type=key, value=value, note="District Office")

    person.add_source(url)
    person.add_link(url)
    yield person
def test_no_membership_for_person_including_party():
    """
    even though party is specified we should still get a no memberships error
    because it doesn't bind the person to a jurisdiction, thus causing
    duplication
    """
    create_jurisdiction()
    Organization.objects.create(id="fnd", name="Foundation",
                                classification="foundation",
                                jurisdiction_id="fnd-jid")
    Organization.objects.create(id="dem", name="Democratic", classification="party")

    # import a person with no memberships
    p = ScrapePerson('a man without a country', party='Democratic')
    person_imp = PersonImporter('fnd-jid')
    org_imp = OrganizationImporter('fnd-jid')
    person_imp.import_data([p.as_dict()])

    # try to import a membership
    dumb_imp = DumbMockImporter()
    memimp = MembershipImporter('fnd-jid', person_imp, org_imp, dumb_imp)

    with pytest.raises(NoMembershipsError):
        memimp.import_data([p._related[0].as_dict()])
def scrape(self):
    # lower
    url = 'http://164.100.47.194/Loksabha/Members/AlphabeticalList.aspx'
    entry = self.get(url).content
    page = lxml.html.fromstring(entry)
    page.make_links_absolute(url)
    for tr in page.xpath('//table[contains(@class,"member_list_table")]/tr'):
        name = tr.xpath('td[2]/a[1]/@title')[0]
        bio = tr.xpath('td[2]/a[1]/@href')[0]
        photo_url = tr.xpath('td[2]/a[1]/img/@src')[0]
        party = tr.xpath('td[3]/text()')[0].strip()
        state = tr.xpath('td[4]/text()')[0].strip()
        member = Person(name=name, role="member", primary_org="lower",
                        party=party, image=photo_url, district=state)
        # source the lower-house list itself (the original pointed at the
        # upper-house URL, which looks like a copy-paste slip)
        member.add_source(url)
        yield member

    # upper
    url = 'http://164.100.47.5/Newmembers/memberlist.aspx'
    entry = self.get(url).content
    page = lxml.html.fromstring(entry)
    page.make_links_absolute(url)
    for tr in page.xpath('//table[@id="ContentPlaceHolder1_GridView2"]/tr')[1:]:
        name = tr.xpath('td[2]/font/a/text()')[0]
        party_abbr = tr.xpath('td[3]/font/text()')[0].strip()
        state = tr.xpath('td[4]/font/text()')[0].strip()
        member = Person(name=name, role="member", primary_org="upper",
                        party=party_abbr, district=state)
        member.add_source(url)
        yield member
def test_person_add_membership():
    p = Person('Bob B. Bear')
    p.add_source('http://example.com')
    o = Organization('test org')
    p.add_membership(o, 'member', start_date='2007')
    assert len(p._related) == 1
    p._related[0].validate()
    assert p._related[0].person_id == p._id
    assert p._related[0].organization_id == o._id
    assert p._related[0].start_date == '2007'
def test_person_add_membership_org():
    p = Person('Bob B. Bear')
    p.add_source('http://example.com')
    o = Organization('test org', classification='unknown')
    p.add_membership(o, role='member', start_date='2007',
                     end_date=datetime.date(2015, 5, 8))
    assert len(p._related) == 1
    p._related[0].validate()
    assert p._related[0].person_id == p._id
    assert p._related[0].organization_id == o._id
    assert p._related[0].start_date == '2007'
    assert p._related[0].end_date == datetime.date(2015, 5, 8)
def test_same_name_people_other_name():
    create_jurisdiction()
    # ensure we're taking other_names into account for the name collision code
    Organization.objects.create(name='WWE', jurisdiction_id='jid')
    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Rock', image='http://example.com/2')
    p2.add_name('Dwayne Johnson')

    # the people have the same name but are apparently different
    with pytest.raises(SameNameError):
        PersonImporter('jid').import_data([p1.as_dict(), p2.as_dict()])
def table_row_to_legislator_and_profile_url(table_row_element, chamber):
    """Derive a Legislator from an HTML table row lxml Element, and a link to their profile"""
    td_elements = table_row_element.xpath("td")
    (
        role_element,
        name_element,
        district_element,
        party_element,
        phone_element,
        email_element,
    ) = td_elements

    # Name comes in the form Last, First
    # last_name_first_name = name_element.text_content().strip()
    # full_name = last_name_first_name_to_full_name(last_name_first_name)
    full_name = name_element.text_content().strip()
    if full_name.count(", ") == 1:
        # flip "Last, First" into "First Last"
        full_name = " ".join(full_name.split(", ")[::-1]).strip()

    district = district_element.text_content().strip()

    party = party_element.text_content().strip()
    if party == "Democrat":
        party = "Democratic"
    elif party == "Unaffiliated":
        party = "Independent"

    role = role_element.text_content().strip()
    address = co_address_from_role(role)
    phone = phone_element.text_content().strip()
    email = email_element.text_content().strip()

    (profile_url,) = name_element.xpath("a/@href")

    print(chamber, district, party)
    legislator = Person(
        primary_org=chamber, name=full_name, district=district, party=party
    )
    legislator.add_contact_detail(type="address", value=address, note="Capitol Office")
    if phone:
        legislator.add_contact_detail(type="voice", value=phone, note="Capitol Office")
    if email:
        legislator.add_contact_detail(type="email", value=email, note="Capitol Office")

    return legislator, profile_url
def test_same_name_second_import():
    create_jurisdiction()
    # ensure two people with the same name don't import without birthdays
    o = Organization.objects.create(name='WWE', jurisdiction_id='jid')
    p1 = ScrapePerson('Dwayne Johnson', image='http://example.com/1')
    p2 = ScrapePerson('Dwayne Johnson', image='http://example.com/2')
    p1.birth_date = '1970'
    p2.birth_date = '1930'

    # when we give them birth dates all is well though
    PersonImporter('jid').import_data([p1.as_dict(), p2.as_dict()])

    # fake some memberships so future lookups work on these people
    for p in Person.objects.all():
        Membership.objects.create(person=p, organization=o)

    p3 = ScrapePerson('Dwayne Johnson', image='http://example.com/3')

    with pytest.raises(SameNameError):
        PersonImporter('jid').import_data([p3.as_dict()])
def test_bill_sponsor_limit_lookup():
    create_jurisdiction()
    org = create_org()
    bill = ScrapeBill('HB 1', '1900', 'Axe & Tack Tax Act',
                      classification='tax bill', chamber='lower')
    bill.add_sponsorship_by_identifier(name="SNODGRASS",
                                       classification='sponsor',
                                       entity_type='person',
                                       primary=True,
                                       identifier="TOTALLY_REAL_ID",
                                       scheme="TOTALLY_REAL_SCHEME")

    oi = OrganizationImporter('jid')
    pi = PersonImporter('jid')

    zs = ScrapePerson(name='Zadock Snodgrass', birth_date="1800-01-01")
    zs.add_identifier(identifier='TOTALLY_REAL_ID', scheme='TOTALLY_REAL_SCHEME')
    pi.import_data([zs.as_dict()])

    za_db = Person.objects.get()
    Membership.objects.create(person_id=za_db.id, organization_id=org.id)

    zs2 = ScrapePerson(name='Zadock Snodgrass', birth_date="1900-01-01")
    zs2.add_identifier(identifier='TOTALLY_REAL_ID', scheme='TOTALLY_REAL_SCHEME')

    # This is contrived and perhaps broken, but we're going to check this.
    # We *really* don't want to *ever* cross jurisdiction bounds.
    PersonImporter('another-jurisdiction').import_data([zs.as_dict()])

    BillImporter('jid', oi, pi).import_data([bill.as_dict()])

    obj = Bill.objects.get()
    (entry, ) = obj.sponsorships.all()
    assert entry.person.name == "Zadock Snodgrass"
    assert entry.person.birth_date == "1800-01-01"
def handle_list_item(self, item):
    name = item.text
    if 'resigned' in name.lower() or 'vacated' in name.lower():
        return
    if (name in CHAMBER_MOVES and
            (self.chamber != CHAMBER_MOVES[name])):
        return

    name, action, date = clean_name(name)

    leg = Person(name=name)
    leg.add_source(self.url)
    leg.add_source(item.get('href'))
    leg.add_link(item.get('href'))
    yield from self.scrape_page(
        self.detail_page,
        item.get('href'),
        session=self.kwargs['session'],
        obj=leg,
    )
    yield leg
def table_row_to_legislator_and_profile_url(table_row_element, chamber):
    """Derive a Legislator from an HTML table row lxml Element, and a link to their profile"""
    td_elements = table_row_element.xpath('td')
    (role_element, name_element, district_element,
     party_element, phone_element, email_element) = td_elements

    # Name comes in the form Last, First
    # last_name_first_name = name_element.text_content().strip()
    # full_name = last_name_first_name_to_full_name(last_name_first_name)
    full_name = name_element.text_content().strip()

    district = district_element.text_content().strip()
    party = party_element.text_content().strip()
    if party == 'Democrat':
        party = 'Democratic'

    role = role_element.text_content().strip()
    address = co_address_from_role(role)
    phone = phone_element.text_content().strip()
    email = email_element.text_content().strip()

    (profile_url, ) = name_element.xpath('a/@href')

    print(chamber, district, party)
    legislator = Person(primary_org=chamber, name=full_name,
                        district=district, party=party)
    legislator.add_contact_detail(type='address', value=address, note='Capitol Office')
    if phone:
        legislator.add_contact_detail(type='voice', value=phone, note='Capitol Office')
    if email:
        legislator.add_contact_detail(type='email', value=email, note='Capitol Office')

    return legislator, profile_url
def handle_list_item(self, item):
    name = item.text
    lname = name.lower()
    if "resigned" in lname or "vacated" in lname or "retired" in lname:
        return
    if name in CHAMBER_MOVES and (self.chamber != CHAMBER_MOVES[name]):
        return

    name, action, date = clean_name(name)

    leg = Person(name=name)
    leg.add_source(self.url)
    leg.add_source(item.get("href"))
    leg.add_link(item.get("href"))
    yield from self.scrape_page(
        self.detail_page,
        item.get("href"),
        session=self.kwargs["session"],
        committees=self.kwargs["committees"],
        obj=leg,
    )
    yield leg
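# `clean_name` is an assumed helper shared by the handle_list_item variants
# above; it is not defined in this listing. A minimal sketch that splits a
# roster entry like "Jane Doe (elected June 5, 2018)" into its parts (the
# exact suffix format is an assumption):
import re

def clean_name(name):
    """Split 'Name (action date)' into (name, action, date); sketch only."""
    match = re.match(r'^(.*?)\s*\((\w+)\s+(.+)\)\s*$', name)
    if match:
        return match.group(1), match.group(2), match.group(3)
    return name, None, None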
def handle_list_item(self, item):
    name = " ".join(item.xpath('.//text()'))
    name = re.sub(r'\s+', " ", name).replace(" ,", ",").strip()

    if 'Vacant' in name:
        return

    district = item.xpath("string(../../td[1])")
    party = item.xpath("string(../../td[2])")
    if party == 'Democrat':
        party = 'Democratic'

    leg_url = item.get('href')

    name = fix_name(name)
    leg = Person(name=name,
                 district=district,
                 party=party,
                 primary_org='upper',
                 role='Senator')
    leg.add_link(leg_url)
    leg.add_source(self.url)
    leg.add_source(leg_url)

    self.scrape_page(SenDetail, leg_url, obj=leg)

    return leg
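# `fix_name` is an assumed helper that normalizes roster names before they go
# into the Person record; it is not defined in this listing. A minimal sketch,
# assuming names arrive as "Last, First" and should come out "First Last":
def fix_name(name):
    """Reorder 'Last, First' to 'First Last'; pass anything else through (sketch)."""
    if ', ' in name:
        last, first = name.split(', ', 1)
        return '{} {}'.format(first, last)
    return name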