def get_organizations(self):
    """Yield the posts of this jurisdiction's organization, then the organization.

    A leader post is yielded first unless the jurisdiction's division is a
    province or territory. One member post is then yielded per child division
    that is not a 'place' and whose type is not listed in the optional
    ``exclude_type_ids`` attribute. When there are no such children and the
    division declares a ``posts_count``, numbered seats are attached directly
    to the organization instead. The organization is always yielded last.
    """
    def role_for(kind):
        # Looked up lazily so the styles table is only consulted when a post
        # of that kind is actually built.
        return styles_of_address[self.division_id][kind]

    # Both settings are optional on the jurisdiction class.
    skipped_types = getattr(self, 'exclude_type_ids', [])
    label_from_id = getattr(self, 'use_type_id', False)

    organization = Organization(self.name, classification=self.classification)
    parent = Division.get(self.division_id)

    if parent._type not in ('province', 'territory'):
        yield Post(
            role=role_for('Leader'),
            label=parent.name,
            division_id=parent.id,
            organization_id=organization._id,
        )

    children = []
    for candidate in parent.children():
        if candidate._type == 'place' or candidate._type in skipped_types:
            continue
        children.append(candidate)

    for child in children:
        if not child:
            continue
        if label_from_id:
            # Derive the label from the last path segment of the division id,
            # e.g. ".../ward:3" becomes "Ward 3".
            label = child.id.rsplit('/', 1)[1].capitalize().replace(':', ' ')
        else:
            label = child.name
        yield Post(
            role=role_for('Member'),
            label=label,
            division_id=child.id,
            organization_id=organization._id,
        )

    if not children and parent.attrs['posts_count']:
        seat_limit = int(parent.attrs['posts_count'])
        # Start at 1 and stop before the count: one seat is excluded (Mayor).
        for seat in range(1, seat_limit):
            organization.add_post(
                role=role_for('Member'),
                label='{} (seat {})'.format(parent.name, seat),
                division_id=parent.id,
            )

    yield organization
def get_organizations(self):
    """Yield the posts of this jurisdiction's organization, then the organization.

    Posts are yielded as standalone objects (rather than attached to the
    organization) to allow ca_on_toronto to make changes. No leader post is
    yielded for provinces and territories (premiers). Child divisions of type
    'place', or of a type in ``self.exclude_types``, are ignored. A remaining
    child gets a member post when:

    * it has no ``validFrom`` attribute and ``self.skip_null_valid_from`` is
      falsy, or
    * its ``validFrom`` is not later than today, or equals ``self.valid_from``.

    When no child divisions remain and the division declares a
    ``posts_count``, numbered seats are added to the organization instead.
    """
    organization = Organization(self.name, classification=self.classification)

    address_styles = styles_of_address[self.division_id]
    leader_role = address_styles['Leader']
    member_role = self.member_role or styles_of_address[self.division_id]['Member']

    parent = Division.get(self.division_id)

    # Don't yield posts for premiers.
    if parent._type not in ('province', 'territory'):
        yield Post(
            role=leader_role,
            label=parent.name,
            division_id=parent.id,
            organization_id=organization._id,
        )

    children = []
    for candidate in parent.children():
        if candidate._type == 'place' or candidate._type in self.exclude_types:
            continue
        children.append(candidate)

    for child in children:
        accept_undated = not self.skip_null_valid_from
        valid_from = child.attrs.get('validFrom')
        if valid_from:
            # Dates are compared as ISO 'YYYY-MM-DD' strings.
            today = datetime.now().strftime('%Y-%m-%d')
            if not (valid_from <= today or valid_from == self.valid_from):
                continue
        elif not accept_undated:
            continue

        if self.use_type_id:
            # Derive the label from the last path segment of the division id.
            label = child.id.rsplit('/', 1)[1].capitalize().replace(':', ' ')
        else:
            label = child.name
        yield Post(
            role=member_role,
            label=label,
            division_id=child.id,
            organization_id=organization._id,
        )

    if not children and parent.attrs['posts_count']:
        seat_limit = int(parent.attrs['posts_count'])
        # Start at 1 and stop before the count: one seat is excluded (Mayor).
        for seat in range(1, seat_limit):
            organization.add_post(
                role=member_role,
                label='{} (seat {})'.format(parent.name, seat),
                division_id=parent.id,
            )

    yield organization
def test_deduplication():
    """Importing posts keeps distinct posts apart and updates roles in place.

    Posts sharing a label in different jurisdictions, and posts with different
    labels in one jurisdiction, must all be imported. Re-importing an existing
    post with a new role must update it rather than create a new row.
    """
    executive_ref = '~{"classification": "executive"}'

    branches = (
        ('us', "United States Executive Branch"),
        ('nc', "North Carolina Executive Branch"),
    )
    for jurisdiction, branch_name in branches:
        Organization.objects.create(
            id=jurisdiction,
            name=branch_name,
            classification="executive",
            jurisdiction_id=jurisdiction,
        )

    president = ScrapePost(label='executive', role='President',
                           organization_id=executive_ref)
    vice_president = ScrapePost(label='vice-executive', role='Vice President',
                                organization_id=executive_ref)
    governor = ScrapePost(label='executive', role='Governor',
                          organization_id=executive_ref)

    # All three must be imported:
    #   president & governor share a label but differ in jurisdiction,
    #   vice president & president share a jurisdiction but differ in label.
    us_orgs = OrganizationImporter('us')
    nc_orgs = OrganizationImporter('nc')
    PostImporter('us', us_orgs).import_data(
        [president.as_dict(), vice_president.as_dict()])
    PostImporter('nc', nc_orgs).import_data([governor.as_dict()])
    assert Post.objects.count() == 3

    # Changing only the role is allowed ...
    president = ScrapePost(label='executive', role='King',
                           organization_id=executive_ref)
    PostImporter('us', us_orgs).import_data([president.as_dict()])

    # ... and results in an update of the existing row, not a new object.
    assert Post.objects.count() == 3
    updated = Post.objects.get(organization_id='us', label='executive')
    assert updated.role == 'King'
def test_full_post():
    """A fully populated scraped post round-trips through the importer."""
    create_jurisdictions()
    org = Organization.objects.create(
        name="United States Executive Branch",
        classification="executive",
        jurisdiction_id="us",
    )

    scraped = ScrapePost(
        label='executive',
        role='President',
        organization_id='~{"classification": "executive"}',
        start_date=datetime.date(2015, 5, 18),
        end_date='2015-05-19',
        maximum_memberships=2,
    )
    scraped.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    scraped.add_link('http://example.com/link')

    # Import the post.
    org_importer = OrganizationImporter('us')
    PostImporter('jurisdiction-id', org_importer).import_data([scraped.as_dict()])
    # Dump the serialized post to stdout.
    print(scraped.as_dict())

    # Fetch the single post from the database and check every imported field.
    imported = Post.objects.get()
    assert 'ocd-post' in imported.id
    assert imported.label == scraped.label
    assert imported.role == scraped.role
    assert imported.organization_id == org.id
    assert imported.maximum_memberships == 2

    contact = imported.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert imported.links.all()[0].url == 'http://example.com/link'

    # Both a date object and an ISO string are expected back as ISO strings.
    assert imported.start_date == '2015-05-18'
    assert imported.end_date == '2015-05-19'
def test_full_post():
    """A scraped post with a contact detail and a link imports intact."""
    org = Organization.objects.create(
        name="United States Executive Branch",
        classification="executive",
        jurisdiction_id="jurisdiction-id",
    )

    scraped = ScrapePost(
        label='executive',
        role='President',
        organization_id='~{"classification": "executive"}',
    )
    scraped.add_contact_detail(type='phone', value='555-555-1234', note='this is fake')
    scraped.add_link('http://example.com/link')

    # Import the post.
    org_importer = OrganizationImporter('jurisdiction-id')
    PostImporter('jurisdiction-id', org_importer).import_data([scraped.as_dict()])

    # Fetch the single post from the database and check every imported field.
    imported = Post.objects.get()
    assert 'ocd-post' in imported.id
    assert imported.label == scraped.label
    assert imported.role == scraped.role
    assert imported.organization_id == org.id

    contact = imported.contact_details.all()[0]
    assert contact.type == 'phone'
    assert contact.value == '555-555-1234'
    assert contact.note == 'this is fake'

    assert imported.links.all()[0].url == 'http://example.com/link'
def test_basic_post():
    """A post built from valid keyword fields shows its label and validates."""
    basic = Post(
        label='1',
        role='Representative',
        organization_id='fake_org',
    )
    rendered = str(basic)
    assert '1' in rendered
    # Must not raise for well-formed data.
    basic.validate()
def test_basic_invalid_post():
    """Validation rejects a post whose label is an integer."""
    malformed = Post(
        label=1,
        role='Representative',
        organization_id='fake_org',
    )
    with pytest.raises(ValueError):
        malformed.validate()
def scrape(self):
    """Yield organizations, posts and people read from the CSV at ``self.csv_url``.

    For every row accepted by ``self.is_valid_row``:

    1. ``self.corrections`` is applied per column: a dict maps exact values to
       replacements, anything else is called with the current value.
    2. The row's organization is yielded the first time its (case-insensitive)
       name is seen, and reused afterwards.
    3. A post for the row's primary role (default 'Councillor') is yielded.
    4. A person is built from the remaining columns, validated and yielded.

    Any exception raised while handling a row is printed and the row is
    skipped, so a single bad row does not abort the scrape.
    """
    known_organizations = {}
    # role -> district -> number of seats handed out so far
    seat_counters = defaultdict(lambda: defaultdict(int))

    reader = self.csv_reader(
        self.csv_url,
        delimiter=self.delimiter,
        header=True,
        encoding=self.encoding,
        skip_rows=self.skip_rows,
    )
    reader.fieldnames = [self.header_converter(field) for field in reader.fieldnames]

    for row in reader:
        try:
            if not self.is_valid_row(row):
                continue

            for column, fix in self.corrections.items():
                if isinstance(fix, dict):
                    if row[column] in fix:
                        row[column] = fix[row[column]]
                else:
                    row[column] = fix(row[column])

            organization_classification = 'legislature'
            organization_name = row['organization']
            organization_key = organization_name.lower()

            if organization_key not in known_organizations:
                organization = Organization(
                    organization_name,
                    classification=organization_classification,
                )
                organization.add_source(self.csv_url)
                yield organization
                known_organizations[organization_key] = organization
            else:
                organization = known_organizations[organization_key]

            if not row['primary role']:
                row['primary role'] = 'Councillor'
            role = row['primary role']

            yield Post(role=role, label=organization_name, organization_id=organization._id)

            name = row['name'].strip(' .,')
            district = row['district name']

            if self.many_posts_per_area and role not in self.unique_roles:
                # Several people share the district: number their seats.
                seat_counters[role][district] += 1
                district = '{} (seat {})'.format(district, seat_counters[role][district])

            p = Person(
                primary_org=organization_classification,
                name=name,
                district=district,
                role=role,
                party=row.get('party name'),
            )
            p.add_source(self.csv_url)

            gender = row.get('gender')
            if gender:
                p.gender = row['gender']

            photo_url = row.get('photo url')
            if photo_url:
                p.image = row['photo url']

            if row.get('source url'):
                p.add_source(row['source url'].strip(' .,'))

            if row.get('website'):
                p.add_link(row['website'], note='web site')
            if row.get('facebook'):
                # Drop any query string or fragment from the Facebook URL.
                p.add_link(re.sub(r'[#?].+', '', row['facebook']))
            if row.get('twitter'):
                p.add_link(row['twitter'])

            if row['email']:
                p.add_contact('email', row['email'].strip(' .,'))
            if row['address']:
                p.add_contact('address', row['address'], 'legislature')

            # Optional telephone-style columns, in the order they are recorded.
            for column, contact_type in (('phone', 'voice'), ('fax', 'fax'), ('cell', 'cell')):
                if row.get(column):
                    p.add_contact(contact_type, row[column], 'legislature')

            if row.get('birth date'):
                p.birth_date = row['birth date']

            if row.get('incumbent'):
                p.extras['incumbent'] = row['incumbent']

            if name in self.other_names:
                for other_name in self.other_names[name]:
                    p.add_name(other_name)

            # Validate person entity so that we can catch the exception if needed.
            p.validate()

            yield p
        except Exception as e:
            print(repr(e))
            continue
def scrape_current_legislators(self, repos):
    """Yield posts, memberships and people for every legislator in ``repos``.

    Each entry of ``repos`` is resolved to a URL with ``self.get_url`` and
    parsed with ``self.yamlize`` into a list of person records. For every
    term of every person this yields:

    * the chamber ``Post`` for the term's division, the first time that
      chamber/division pair is seen within the repo,
    * a ``Membership`` linking the person to that post, and
    * a party ``Membership`` when the term names a party.

    The ``Person`` is yielded last, and only if the record had at least one
    term.

    Raises:
        KeyError: if a term's ``type`` is neither ``'rep'`` nor ``'sen'``, or
            a required key (``name``, ``bio``, ``start``, ``end``, ``state``,
            ``type``) is missing.
    """
    roles = {
        'rep': 'Representative',
        'sen': 'Senator',
    }

    for repo in repos:
        source_url = self.get_url(repo)
        people = self.yamlize(source_url)

        # Keyed by (term type, division id). The division id alone is not
        # unique: an at-large representative (district 0) and a senator both
        # use the state-level division, and must not share one Post.
        posts = {}

        for person in people:
            name = person['name'].get('official_full')
            if name is None:
                name = "{name[first]} {name[last]}".format(**person)

            # Only pass a birth date when this record has one, so a person
            # without a birthday never inherits the previous person's value.
            person_kwargs = {'name': name}
            if 'birthday' in person['bio']:
                person_kwargs['birth_date'] = person['bio']['birthday']

            who = Person(**person_kwargs)
            who.add_source(url=source_url,
                           note="unitedstates project on GitHub")

            has_term = False
            for term in person.get('terms', []):
                has_term = True
                start_date = term['start']
                end_date = term['end']
                state = term['state']
                type_ = term['type']
                district = term.get('district', None)
                party = term.get('party', None)

                role = roles[type_]
                state_division = (
                    "ocd-division/country:us/state:{state}".format(
                        state=state.lower()))

                division_id = None
                label = None
                if type_ == "rep" and district is not None:
                    label = "%s for District %s in %s" % (role, district,
                                                          state)
                    if district == 0:
                        # District 0 is represented by the whole state.
                        division_id = state_division
                    else:
                        division_id = (
                            "ocd-division/country:us/"
                            "state:{state}/cd:{district}".format(
                                state=state.lower(), district=district))
                elif type_ == "sen":
                    division_id = state_division
                    # NOTE(review): "Senitor" looks like a typo for "Senator".
                    # It is kept as-is because the label is part of the
                    # emitted Post/Membership data; confirm before changing.
                    label = "Senitor for %s" % (state)

                # A representative term without a district gets no post.
                if division_id is not None:
                    chamber = {
                        "rep": self.house,
                        "sen": self.senate,
                    }[type_]

                    post_key = (type_, division_id)
                    post = posts.get(post_key)
                    if post is None:
                        post = Post(organization_id=chamber._id,
                                    division_id=division_id,
                                    label=label,
                                    role=role)
                        posts[post_key] = post
                        yield post

                    yield Membership(post_id=post._id,
                                     role=role,
                                     label=label,
                                     start_date=start_date,
                                     end_date=end_date,
                                     person_id=who._id,
                                     organization_id=chamber._id)

                if party == "Democrat":
                    party = "Democratic"

                if party:
                    yield Membership(role='member',
                                     start_date=start_date,
                                     end_date=end_date,
                                     person_id=who._id,
                                     organization_id=make_pseudo_id(
                                         classification="party",
                                         name=party))

            # An id value may be a single identifier or a list of them.
            for key, value in person.get('id', {}).items():
                if isinstance(value, list):
                    for v in value:
                        who.add_identifier(str(v), scheme=key)
                else:
                    who.add_identifier(str(value), scheme=key)

            if has_term:
                yield who
def test_basic_post():
    """A post built from valid positional fields shows its label and validates."""
    basic = Post(
        '1',
        'Representative',
        'fake_org',
    )
    rendered = str(basic)
    assert '1' in rendered
    # Must not raise for well-formed data.
    basic.validate()
def test_basic_invalid_post():
    """Validation rejects a post whose first positional field is an integer."""
    malformed = Post(
        1,
        'Representative',
        'fake_org',
    )
    with pytest.raises(ValueError):
        malformed.validate()