def scrape(self):
    sessions = reversed(self.jurisdiction.legislative_sessions)
    committee_term_instances = committees_from_sessions(self, sessions)
    committees_by_code = build_lookup_dict(
        self, data_list=committee_term_instances, index_key='code')

    for code, instances in committees_by_code.items():
        # TODO: Figure out how to edit city council org.
        if code == 'CC':
            continue

        extras = {'tmmis_decision_body_ids': []}
        for i, inst in enumerate(instances):
            # TODO: Ensure this survives addition of new term (2017)
            # so specific year always creates
            canonical_i = 0
            if i == canonical_i:
                o = Organization(name=inst['name'], classification='committee')
                extras.update({'description': inst['info']})
                o.add_identifier(inst['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

            extras['tmmis_decision_body_ids'].append(
                {inst['term']: inst['decision_body_id']})
            o.extras = extras
            o.add_source(inst['source_url'])

            if instances[canonical_i]['name'] != inst['name']:
                # TODO: Add start_date and end_date
                o.add_name(inst['name'])

        yield o
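committees_from_sessions() and build_lookup_dict() are repo-local helpers that are not shown in this snippet. The sketch below is only an assumption about the grouping the loop above relies on: one list of per-term committee instances under each two-letter code. The function name and signature here are illustrative, not the project's actual helper.

# Assumed behaviour only; the real helper takes the scraper as its first
# argument and may do more (logging, validation) than this sketch.
from collections import defaultdict

def build_lookup_dict_sketch(scraper, data_list, index_key):
    lookup = defaultdict(list)
    for item in data_list:
        lookup[item[index_key]].append(item)   # e.g. 'EX' -> [term instances...]
    return dict(lookup)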
def test_extras_organization():
    org = ScrapeOrganization('United Nations', classification='international')
    org.extras = {"hello": "world", "foo": {"bar": "baz"}}
    od = org.as_dict()

    OrganizationImporter('jurisdiction-id').import_data([od])
    o = Organization.objects.get()
    assert o.extras['foo']['bar'] == 'baz'
def scrape(self):
    sessions = reversed(self.jurisdiction.legislative_sessions)
    committee_term_instances = committees_from_sessions(self, sessions)
    committees_by_code = build_lookup_dict(
        self, data_list=committee_term_instances, index_key='code')

    for code, instances in committees_by_code.items():
        # TODO: Figure out how to edit city council org.
        if code == 'CC':
            continue
        # When there are no meetings scheduled, there is no way to deduce the committee code.
        if not code:
            continue

        extras = {'tmmis_decision_body_ids': []}
        for i, inst in enumerate(instances):
            # TODO: Ensure this survives addition of new term (2017)
            # so specific year always creates
            canonical_i = 0
            if i == canonical_i:
                o = Organization(name=inst['name'], classification='committee')
                extras.update({'description': inst['info']})
                o.add_identifier(inst['code'], scheme=TWO_LETTER_ORG_CODE_SCHEME)

                # TODO: Scrape non-councillor members
                meeting_id = self.referenceMeetingId(inst['code'], inst['term'])
                if meeting_id:
                    seen_posts = []
                    membership_url = MEMBERSHIP_URL_TEMPLATE.format(meeting_id)
                    for councillor in self.councillorMembers(membership_url):
                        o.add_member(councillor['name'], councillor['role'])
                        if councillor['role'] not in seen_posts:
                            o.add_post(
                                role=councillor['role'],
                                label=councillor['role'],
                                # TODO: More specific divisions for some committee?
                                division_id=self.jurisdiction.division_id,
                            )
                            seen_posts.append(councillor['role'])

            extras['tmmis_decision_body_ids'].append(
                {inst['term']: inst['decision_body_id']})
            o.extras = extras
            o.add_source(inst['source_url'])

            if instances[canonical_i]['name'] != inst['name']:
                # TODO: Add start_date and end_date
                o.add_name(inst['name'])

        yield o
def scrape_committee(self, comm_num):
    url = self.committee_url(comm_num)
    page = self.lxmlize(url)

    # get title
    comm_name = page.xpath("//h1/text()")[0]

    # create object
    comm = Organization(name=comm_name, classification="committee",
                        chamber="legislature")
    comm.add_source(url=url)

    # add posts
    comm.add_post(label="chair", role="chair")
    # FIXME do we need a separate post for each member?
    # FIXME is member an appropriate name?
    comm.add_post(label="member", role="member")

    # helper for finding other nodes
    landmark_node = page.xpath("//h2[text()='Committee Members']")[0]

    # add memberships
    member_names = landmark_node.xpath("following-sibling::div/ul/li/a/text()")
    fl_names = [HumanName.name_firstandlast(name) for name in member_names]
    print("My attempt to scrub people's names:", list(zip(member_names, fl_names)))
    chair_name, *other_names = fl_names
    if chair_name not in {'Lewis Reed'}:
        comm.add_member(chair_name, role="chair")
    for name in other_names:
        if name not in {'Lewis Reed'}:
            comm.add_member(name, role="member")

    # add description
    about_node = page.xpath("//h2[text()='About']")[0]
    (description, ) = about_node.xpath(
        "parent::div//div[@class='content-block']/p[2]/text()")
    description = description.strip()
    comm.extras = {"description": description}

    yield comm
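HumanName.name_firstandlast() used above is a helper that is not included in this snippet. A minimal sketch of what it is assumed to do, built on the nameparser package, could look like the following; the function name and exact behaviour are assumptions, not the project's actual helper.

# Assumed behaviour only: reduce a scraped display name to a bare "First Last"
# string, dropping titles, middle names and suffixes.
from nameparser import HumanName as ParsedName

def name_firstandlast_sketch(raw_name):
    parsed = ParsedName(raw_name)    # splits title/first/middle/last/suffix
    return ' '.join(part for part in (parsed.first, parsed.last) if part)

# name_firstandlast_sketch("Dr. Joseph Vollmer Jr.") -> "Joseph Vollmer"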
def scrape_session(self, session, chambers):
    sid = SESSION_SITE_IDS[session]
    committees = backoff(self.cservice.GetCommitteesBySession, sid)

    # if committees.strip() == "":
    #     return
    # If we get here, it's a problem.
    # Commenting this out for future debugging. - PRT

    if str(committees).strip() == "":
        raise ValueError("Error: No committee data for sid: %s" % (sid))

    committees = committees['CommitteeListing']
    for committee in committees:
        cid = committee['Id']
        committee = backoff(self.cservice.GetCommittee, cid)
        subctty_cache = {}

        comname, typ, guid, code, description = [committee[x] for x in [
            'Name', 'Type', 'Id', 'Code', 'Description'
        ]]
        comchamber = {
            "House": "lower",
            "Senate": "upper",
            "Joint": "joint"
        }[typ]

        ctty_key = '{}-{}'.format(typ, code)
        if ctty_key not in self.ctty_cache:
            ctty = Organization(chamber=comchamber, name=comname,
                                classification='committee')
            ctty.extras = {
                'code': code,
                'guid': guid,
                'description': description,
            }
            self.ctty_cache[ctty_key] = ctty
        else:
            # Reuse the committee already built for this chamber/code pair.
            ctty = self.ctty_cache[ctty_key]

        members = committee['Members']['CommitteeMember']
        for member in members:
            name = "{First} {Last}".format(**dict(member['Member']['Name']))
            role = member['Role']
            membership = ctty.add_member(name, role)
            membership.extras = {'guid': member['Member']['Id']}

            subcoms = member['SubCommittees'] or []
            for subcom in subcoms:
                subcom = subcom[1][0]
                subguid = subcom['Id']
                subcommittee = subcom['Name']
                if subcommittee in subctty_cache:
                    # Add member to existing subcommittee.
                    subctty = subctty_cache[subcommittee]
                else:
                    # Create subcommittee.
                    subctty = Organization(
                        name=subcommittee,
                        classification='committee',
                        parent_id={'classification': comchamber,
                                   'name': comname})
                    subctty.extras = {
                        'guid': subguid,
                    }
                    subctty.add_source(self.csource)
                    subctty.add_source(CTTIE_URL.format(**{
                        "sid": sid,
                        "cttie": guid,
                    }))
                    subctty_cache[subcommittee] = subctty

                membership = subctty.add_member(name, role)
                membership.extras = {'guid': member['Member']['Id']}

        for subctty in subctty_cache.values():
            yield subctty

        ctty.add_source(self.csource)
        ctty.add_source(CTTIE_URL.format(**{
            "sid": sid,
            "cttie": guid,
        }))
        yield ctty
def _scrape_lower_chamber(self, session):
    self.info('Scraping lower chamber for committees.')

    chamber = 'lower'

    url = '{base}ActiveCommittees.aspx'.format(base=self._reps_url_base)
    page_string = self.get(url).text
    page = lxml.html.fromstring(page_string)
    table = page.xpath('//div[@class="lightened"]/table[1]')[0]
    # Last tr has the date
    trs = table.xpath('tr')[:-1]
    for tr in trs:
        committee_parts = [part.strip()
                           for part in tr.text_content().split(',')]
        committee_name = committee_parts[0].title().strip()

        status = ''
        # The status only exists when the row has a second comma-separated part.
        if len(committee_parts) > 1:
            status = committee_parts[1].strip()

        committee_url = tr.xpath('td/a')[0].attrib.get('href')
        committee_url = '{base}{url}'.format(base=self._reps_url_base,
                                             url=committee_url)

        actual_chamber = chamber
        if 'joint' in committee_name.lower():
            actual_chamber = 'joint'

        committee_name = committee_name.replace('Committee On ', '')
        committee_name = committee_name.replace('Special', '')
        committee_name = committee_name.replace('Select', '')
        committee_name = committee_name.replace('Special', '')
        committee_name = committee_name.replace('Joint', '')
        committee_name = committee_name.replace(' Committee', '')
        committee_name = committee_name.strip()

        committee = Organization(
            committee_name,
            chamber=actual_chamber,
            classification='committee',
        )
        committee.extras = {'status': status}

        committee_page_string = self.get(committee_url).text
        committee_page = lxml.html.fromstring(committee_page_string)
        # First tr has the title (sigh)
        mem_trs = committee_page.xpath('id("memGroup")/tr')[1:]
        for mem_tr in mem_trs:
            mem_code = None
            mem_links = mem_tr.xpath('td/a[1]')
            if len(mem_links):
                mem_code = mem_links[0].attrib.get('href')

            # Output is "Rubble, Barney, Neighbor"
            mem_parts = mem_tr.text_content().strip().split(',')
            if self._no_members_text in mem_parts:
                continue
            mem_name = (mem_parts[1].strip() + ' ' + mem_parts[0].strip())
            # Sometimes Senator abbreviation is in the name
            mem_name = mem_name.replace('Sen. ', '')

            mem_role = 'member'
            if len(mem_parts) > 2:
                # Handle the case where there is a comma in the role name
                mem_role = ', '.join(
                    [p.strip() for p in mem_parts[2:]]).lower()

            membership = committee.add_member(mem_name, role=mem_role)
            membership.extras = {'code': mem_code}

        committee.add_source(url)
        committee.add_source(committee_url)
        yield committee
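backoff() wraps the SOAP calls above but is not defined in this snippet. The sketch below only illustrates the retry-with-sleep behaviour it is assumed to provide; the retry count, delay, and exception handling here are assumptions.

import time

def backoff_sketch(function, *args, retries=5, delay=15, **kwargs):
    last_error = None
    for attempt in range(1, retries + 1):
        try:
            return function(*args, **kwargs)
        except Exception as error:        # the real helper may catch narrower errors
            last_error = error
            time.sleep(delay * attempt)   # wait a bit longer after each failure
    raise last_error

# e.g. committees = backoff_sketch(cservice.GetCommitteesBySession, sid)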
def transform_parse(self, parsed_form, response):
    _source = {
        "url": response.url,
        "note": "LDA Form LD-1"
    }

    # basic disclosure fields
    _disclosure = Disclosure(
        effective_date=datetime.strptime(
            parsed_form['datetimes']['effective_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        timezone='America/New_York',
        submitted_date=datetime.strptime(
            parsed_form['datetimes']['signature_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        classification="lobbying"
    )

    _disclosure.add_authority(name=self.authority.name,
                              type=self.authority._type,
                              id=self.authority._id)

    _disclosure.add_identifier(
        identifier=parsed_form['_meta']['document_id'],
        scheme="urn:sopr:filing"
    )

    # disclosure extras
    _disclosure.extras = {}
    _disclosure.extras['registrant'] = {
        'self_employed_individual': parsed_form['registrant']['self_employed_individual'],
        'general_description': parsed_form['registrant']['registrant_general_description'],
        'signature': {
            "signature_date": parsed_form['datetimes']['signature_date'],
            "signature": parsed_form['signature']
        }
    }

    _disclosure.extras['client'] = {
        'same_as_registrant': parsed_form['client']['client_self'],
        'general_description': parsed_form['client']['client_general_description']
    }

    _disclosure.extras['registration_type'] = {
        'is_amendment': parsed_form['registration_type']['is_amendment'],
        'new_registrant': parsed_form['registration_type']['new_registrant'],
        'new_client_for_existing_registrant':
            parsed_form['registration_type']['new_client_for_existing_registrant'],
    }

    #
    # Registrant
    # build registrant
    _registrant_self_employment = None

    if parsed_form['registrant']['self_employed_individual']:
        n = ' '.join([p for p in [
            parsed_form['registrant']['registrant_individual_prefix'],
            parsed_form['registrant']['registrant_individual_firstname'],
            parsed_form['registrant']['registrant_individual_lastname']
        ] if len(p) > 0]).strip()

        _registrant = Person(
            name=n,
            source_identified=True
        )

        _registrant_self_employment = Organization(
            name='SELF-EMPLOYMENT of {n}'.format(n=n),
            classification='company',
            source_identified=True
        )

        _registrant.add_membership(
            organization=_registrant_self_employment,
            role='self_employed',
            label='self-employment of {n}'.format(n=n),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )
    else:
        _registrant = Organization(
            name=parsed_form['registrant']['registrant_org_name'],
            classification='company',
            source_identified=True
        )

    if len(parsed_form['registrant']['registrant_house_id']) > 0:
        _registrant.add_identifier(
            identifier=parsed_form['registrant']['registrant_house_id'],
            scheme='urn:house_clerk:registrant'
        )

    if len(parsed_form['registrant']['registrant_senate_id']) > 0:
        _registrant.add_identifier(
            identifier=parsed_form['registrant']['registrant_senate_id'],
            scheme='urn:sopr:registrant'
        )

    registrant_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": '; '.join([
                p for p in [
                    parsed_form['registrant']['registrant_address_one'],
                    parsed_form['registrant']['registrant_address_two'],
                    parsed_form['registrant']['registrant_city'],
                    parsed_form['registrant']['registrant_state'],
                    parsed_form['registrant']['registrant_zip'],
                    parsed_form['registrant']['registrant_country']]
                if len(p) > 0]).strip(),
        },
        {
            "type": "voice",
            "note": "contact phone",
            "value": parsed_form['registrant']['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": parsed_form['registrant']['registrant_contact_email'],
        },
    ]

    registrant_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": '; '.join([
            p for p in [
                parsed_form['registrant']['registrant_ppb_city'],
                parsed_form['registrant']['registrant_ppb_state'],
                parsed_form['registrant']['registrant_ppb_zip'],
                parsed_form['registrant']['registrant_ppb_country']]
            if len(p) > 0]).strip(),
    }

    if registrant_contact_ppb["value"]:
        registrant_contact_details.append(registrant_contact_ppb)

    for cd in registrant_contact_details:
        _registrant.add_contact_detail(**cd)

    _registrant.extras = {
        "contact_details_structured": [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {"note": "address_one",
                     "value": parsed_form['registrant']['registrant_address_one']},
                    {"note": "address_two",
                     "value": parsed_form['registrant']['registrant_address_two']},
                    {"note": "city",
                     "value": parsed_form['registrant']['registrant_city']},
                    {"note": "state",
                     "value": parsed_form['registrant']['registrant_state']},
                    {"note": "zip",
                     "value": parsed_form['registrant']['registrant_zip']},
                    {"note": "country",
                     "value": parsed_form['registrant']['registrant_country']},
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {"note": "city",
                     "value": parsed_form['registrant']['registrant_ppb_city']},
                    {"note": "state",
                     "value": parsed_form['registrant']['registrant_ppb_state']},
                    {"note": "zip",
                     "value": parsed_form['registrant']['registrant_ppb_zip']},
                    {"note": "country",
                     "value": parsed_form['registrant']['registrant_ppb_country']},
                ],
            },
        ]
    }

    #
    # People
    # build contact
    _main_contact = Person(
        name=parsed_form['registrant']['registrant_contact_name'],
        source_identified=True
    )

    main_contact_contact_details = [
        {
            "type": "voice",
            "note": "contact phone",
            "value": parsed_form['registrant']['registrant_contact_phone'],
        },
        {
            "type": "email",
            "note": "contact email",
            "value": parsed_form['registrant']['registrant_contact_email'],
        }
    ]

    for cd in main_contact_contact_details:
        _main_contact.add_contact_detail(**cd)

    if _registrant._type == 'organization':
        _registrant.add_member(
            name_or_person=_main_contact,
            role='main_contact',
            label='main contact for {n}'.format(n=_registrant.name),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )
    else:
        _registrant_self_employment.add_member(
            name_or_person=_main_contact,
            role='main_contact',
            label='main contact for {n}'.format(n=_registrant.name),
            start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
        )

    #
    # Client
    # build client
    _client = Organization(
        name=parsed_form['client']['client_name'],
        classification='company',
        source_identified=True
    )

    client_contact_details = [
        {
            "type": "address",
            "note": "contact address",
            "value": '; '.join([
                p for p in [
                    parsed_form['client']['client_address'],
                    parsed_form['client']['client_city'],
                    parsed_form['client']['client_state'],
                    parsed_form['client']['client_zip'],
                    parsed_form['client']['client_country']]
                if len(p) > 0]).strip(),
        },
    ]

    client_contact_ppb = {
        "type": "address",
        "note": "principal place of business",
        "value": '; '.join([
            p for p in [
                parsed_form['client']['client_ppb_city'],
                parsed_form['client']['client_ppb_state'],
                parsed_form['client']['client_ppb_zip'],
                parsed_form['client']['client_ppb_country']]
            if len(p) > 0]).strip(),
    }

    if client_contact_ppb["value"]:
        client_contact_details.append(client_contact_ppb)

    for cd in client_contact_details:
        _client.add_contact_detail(**cd)

    _client.extras = {
        "contact_details_structured": [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {"note": "address", "value": parsed_form['client']['client_address']},
                    {"note": "city", "value": parsed_form['client']['client_city']},
                    {"note": "state", "value": parsed_form['client']['client_state']},
                    {"note": "zip", "value": parsed_form['client']['client_zip']},
                    {"note": "country", "value": parsed_form['client']['client_country']},
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {"note": "city", "value": parsed_form['client']['client_ppb_city']},
                    {"note": "state", "value": parsed_form['client']['client_ppb_state']},
                    {"note": "zip", "value": parsed_form['client']['client_ppb_zip']},
                    {"note": "country", "value": parsed_form['client']['client_ppb_country']},
                ],
            },
        ],
    }

    # Collect Foreign Entities
    _foreign_entities = []
    _foreign_entities_by_name = {}
    for fe in parsed_form['foreign_entities']:
        fe_extras = {}
        fe_name = fe['foreign_entity_name']

        # check for name-based duplicates
        if fe_name in _foreign_entities_by_name:
            _foreign_entity = _foreign_entities_by_name[fe_name]
        else:
            _foreign_entity = Organization(
                name=fe_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        foreign_entity_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_address'],
                        fe['foreign_entity_city'],
                        fe['foreign_entity_state'],
                        fe['foreign_entity_country']]
                    if len(p) > 0]).strip(),
            },
            {
                "type": "address",
                "note": "principal place of business",
                "value": '; '.join([
                    p for p in [
                        fe['foreign_entity_ppb_state'],
                        fe['foreign_entity_ppb_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        foreign_entity_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    fe['foreign_entity_ppb_city'],
                    fe['foreign_entity_ppb_state'],
                    fe['foreign_entity_ppb_country']]
                if len(p) > 0]),
        }

        if foreign_entity_contact_ppb["value"]:
            foreign_entity_contact_details.append(foreign_entity_contact_ppb)

        # add contact details
        for cd in foreign_entity_contact_details:
            if cd['value'] != '':
                _foreign_entity.add_contact_detail(**cd)

        # add extras
        fe_extras["contact_details_structured"] = [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {"note": "address", "value": fe['foreign_entity_address']},
                    {"note": "city", "value": fe['foreign_entity_city']},
                    {"note": "state", "value": fe['foreign_entity_state']},
                    {"note": "country", "value": fe['foreign_entity_country']},
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {"note": "state", "value": fe['foreign_entity_ppb_state']},
                    {"note": "country", "value": fe['foreign_entity_ppb_country']},
                ],
            },
        ]

        _foreign_entity.extras = combine_dicts(_foreign_entity.extras, fe_extras)
        _foreign_entities_by_name[fe_name] = _foreign_entity

    for unique_foreign_entity in _foreign_entities_by_name.values():
        _foreign_entities.append(unique_foreign_entity)

    # TODO: add a variant on memberships to represent inter-org
    # relationships (associations, ownership, etc)
    #
    # _client['memberships'].append({
    #     "id": _foreign_entity['id'],
    #     "classification": "organization",
    #     "name": _foreign_entity['name'],
    #     "extras": {
    #         "ownership_percentage":
    #             fe['foreign_entity_amount']
    #     }
    # })

    # Collect Lobbyists
    # TODO: deal with weird non-name line continuation cases (blanks, "continued")
    _lobbyists_by_name = {}

    for l in parsed_form['lobbyists']:
        l_extras = {}
        l_name = ' '.join([l['lobbyist_first_name'],
                           l['lobbyist_last_name'],
                           l['lobbyist_suffix']
                           ]).strip()

        if l_name in _lobbyists_by_name:
            _lobbyist = _lobbyists_by_name[l_name]
        else:
            _lobbyist = Person(
                name=l_name,
                source_identified=True
            )

        if l['lobbyist_covered_official_position']:
            l_extras['lda_covered_official_positions'] = [
                {
                    'date_reported': parsed_form['datetimes']['effective_date'],
                    'covered_official_position': l['lobbyist_covered_official_position']
                },
            ]

        _lobbyist.extras = combine_dicts(_lobbyist.extras, l_extras)
        _lobbyists_by_name[l_name] = _lobbyist

    _lobbyists = []
    for unique_lobbyist in _lobbyists_by_name.values():
        _lobbyists.append(unique_lobbyist)

    if _registrant._type == 'organization':
        for l in _lobbyists:
            _registrant.add_member(
                l,
                role='lobbyist',
                label='lobbyist for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )
    else:
        for l in _lobbyists:
            _registrant_self_employment.add_member(
                l,
                role='lobbyist',
                label='lobbyist for {n}'.format(n=_registrant.name),
                start_date=_disclosure.effective_date.strftime('%Y-%m-%d')
            )

    #
    # Document
    # build document
    _disclosure.add_document(
        note='submitted filing',
        date=parsed_form['datetimes']['effective_date'][:10],
        url=response.url
    )

    # Collect Affiliated orgs
    _affiliated_organizations = []
    _affiliated_organizations_by_name = {}
    for ao in parsed_form['affiliated_organizations']:
        ao_extras = {}
        ao_name = ao['affiliated_organization_name']
        if ao_name in _affiliated_organizations_by_name:
            # There's already one by this name
            _affiliated_organization = _affiliated_organizations_by_name[ao_name]
        else:
            # New affiliated org
            _affiliated_organization = Organization(
                name=ao_name,
                classification='company',
                source_identified=True
            )

        # collect contact details
        affiliated_organization_contact_details = [
            {
                "type": "address",
                "note": "contact address",
                "value": '; '.join([
                    p for p in [
                        ao['affiliated_organization_address'],
                        ao['affiliated_organization_city'],
                        ao['affiliated_organization_state'],
                        ao['affiliated_organization_zip'],
                        ao['affiliated_organization_country']]
                    if len(p) > 0]).strip(),
            },
        ]

        affiliated_organization_contact_ppb = {
            "type": "address",
            "note": "principal place of business",
            "value": '; '.join([
                p for p in [
                    ao['affiliated_organization_ppb_city'],
                    ao['affiliated_organization_ppb_state'],
                    ao['affiliated_organization_ppb_country']]
                if len(p) > 0]).strip(),
        }

        if affiliated_organization_contact_ppb["value"]:
            affiliated_organization_contact_details.append(
                affiliated_organization_contact_ppb)

        # add contact details
        for cd in affiliated_organization_contact_details:
            _affiliated_organization.add_contact_detail(**cd)

        ao_extras["contact_details_structured"] = [
            {
                "type": "address",
                "note": "contact address",
                "parts": [
                    {"note": "address", "value": ao['affiliated_organization_address']},
                    {"note": "city", "value": ao['affiliated_organization_city']},
                    {"note": "state", "value": ao['affiliated_organization_state']},
                    {"note": "zip", "value": ao['affiliated_organization_zip']},
                    {"note": "country", "value": ao['affiliated_organization_country']},
                ],
            },
            {
                "type": "address",
                "note": "principal place of business",
                "parts": [
                    {"note": "city", "value": ao['affiliated_organization_ppb_city']},
                    {"note": "state", "value": ao['affiliated_organization_ppb_state']},
                    {"note": "country", "value": ao['affiliated_organization_ppb_country']},
                ],
            },
        ]

        _affiliated_organization.extras = combine_dicts(
            _affiliated_organization.extras, ao_extras)
        # remember this org so the dedup pass below can emit it
        _affiliated_organizations_by_name[ao_name] = _affiliated_organization

    for unique_affiliated_organization in _affiliated_organizations_by_name.values():
        _affiliated_organizations.append(unique_affiliated_organization)

    #
    # Events & Agendas
    # name
    if parsed_form['registration_type']['new_registrant']:
        registration_type = 'New Client, New Registrant'
    elif parsed_form['registration_type']['is_amendment']:
        registration_type = 'Amended Registration'
    else:
        registration_type = 'New Client for Existing Registrant'

    # Create registration event
    _event = Event(
        name="{rn} - {rt}, {cn}".format(rn=_registrant.name,
                                        rt=registration_type,
                                        cn=_client.name),
        timezone='America/New_York',
        location='United States',
        start_time=datetime.strptime(
            parsed_form['datetimes']['effective_date'],
            '%Y-%m-%d %H:%M:%S').replace(tzinfo=UTC),
        classification='registration'
    )

    # add participants
    _event.add_participant(type=_registrant._type, id=_registrant._id,
                           name=_registrant.name, note="registrant")

    if _registrant._type == 'person':
        _event.add_participant(type=_registrant._type, id=_registrant._id,
                               name=_registrant.name, note="registrant")

    _event.add_participant(type=_client._type, id=_client._id,
                           name=_client.name, note="client")

    for l in _lobbyists:
        _event.add_participant(type=l._type, id=l._id,
                               name=l.name, note='lobbyist')

    for fe in _foreign_entities:
        _event.add_participant(type=fe._type, id=fe._id,
                               name=fe.name, note='foreign_entity')

    for ao in _affiliated_organizations:
        _event.add_participant(type=ao._type, id=ao._id,
                               name=ao.name, note='affiliated_organization')

    # add agenda item
    _agenda = _event.add_agenda_item(
        description='issues lobbied on',
    )

    _agenda['notes'].append(
        parsed_form['lobbying_issues_detail']
    )

    for li in parsed_form['lobbying_issues']:
        if li['general_issue_area'] != '':
            _agenda.add_subject(li['general_issue_area'])

    _disclosure.add_disclosed_event(
        name=_event.name,
        type=_event._type,
        classification=_event.classification,
        id=_event._id
    )

    # add registrant to disclosure's _related and related_entities fields
    _disclosure.add_registrant(name=_registrant.name,
                               type=_registrant._type,
                               id=_registrant._id)

    _registrant.add_source(url=_source['url'], note='registrant')
    yield _registrant

    if _registrant_self_employment is not None:
        _registrant_self_employment.add_source(
            url=_source['url'],
            note='registrant_self_employment'
        )
        yield _registrant_self_employment

    _client.add_source(url=_source['url'], note='client')
    yield _client

    _main_contact.add_source(url=_source['url'], note='main_contact')
    yield _main_contact

    for ao in _affiliated_organizations:
        ao.add_source(url=_source['url'], note='affiliated_organization')
        yield ao

    for fe in _foreign_entities:
        fe.add_source(url=_source['url'], note='foreign_entity')
        yield fe

    for l in _lobbyists:
        l.add_source(url=_source['url'], note='lobbyist')
        yield l

    _event.add_source(**_source)
    yield _event

    _disclosure.add_source(**_source)
    yield _disclosure
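combine_dicts() is used above to fold per-filing extras into entities that may already carry extras from an earlier record; it is not defined in this snippet. The sketch below is only an assumed recursive merge, not the project's actual helper.

# Assumed behaviour only: merge two extras dicts, preferring newer values and
# merging nested dicts instead of overwriting them.
def combine_dicts_sketch(base, update):
    merged = dict(base or {})
    for key, value in (update or {}).items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = combine_dicts_sketch(merged[key], value)
        else:
            merged[key] = value
    return merged

# combine_dicts_sketch({'a': {'x': 1}}, {'a': {'y': 2}}) -> {'a': {'x': 1, 'y': 2}}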