def person_data(self, representative, division_id, division_name, role, organization_name):
    """Turn one scraped representative element into a ``Person``.

    The name, phone number and optional email address are read from
    ``representative`` with XPath queries and passed through the hard-coded
    correction tables below.  ``division_name`` and ``role`` become the
    person's district and role, ``organization_name`` the primary
    organization name, and the last ``:``-separated part of ``division_id``
    is used to build the boundary URL.

    People whose name is in the duplicate list receive a placeholder
    ``birth_date`` taken from ``self.birth_date``, which is incremented
    after each use.

    Returns the populated ``Person``.
    """
    # Corrections and tweaks.
    names_needing_birth_date = {'Colleen Evans', 'Kim Watt-Senner'}
    fixed_names = {
        'Claire l Moglove': 'Claire Moglove',
        'KSenya Dorwart': 'Ksenya Dorwart',
    }
    fixed_emails = {'sharrison@qualicumbeach,com': '*****@*****.**'}

    # Name: trim, collapse runs of spaces, then apply corrections.
    raw_name = str(representative.xpath('a/b/text()')[0])
    representative_name = re.sub(' +', ' ', raw_name.strip())
    representative_name = fixed_names.get(representative_name, representative_name)

    # Phone: the slice is applied to the string form of the whole XPath
    # result list, not to a single text node.
    # NOTE(review): the 12/-2 offsets look tied to the page layout - confirm.
    phone_nodes = representative.xpath('text()[contains(., "Phone")]')
    representative_phone = str(phone_nodes)[12:-2].replace('-', '')

    # Email: optional; stays None when there is no mailto link.
    email_scrape = representative.xpath('a[contains(@href,"mailto:")]/text()')
    representative_email = None
    if email_scrape:
        representative_email = fixed_emails.get(email_scrape[0], email_scrape[0])

    # Create record and append contact data.
    person = Person(
        primary_org='government',
        primary_org_name=organization_name,
        name=representative_name,
        district=division_name,
        role=role,
    )
    person.add_source(LIST_PAGE)

    # Handle duplicate names.
    if representative_name in names_needing_birth_date:
        person.birth_date = str(self.birth_date)
        self.birth_date += 1

    if representative_email is not None:
        person.add_contact('email', representative_email)
    # Only ten-character phone strings are kept.
    if len(representative_phone) == 10:
        person.add_contact('voice', representative_phone, 'legislature')

    subdivision_code = division_id.rsplit(':', 1)[1]
    person._related[0].extras['boundary_url'] = (
        '/boundaries/census-subdivisions/{}/'.format(subdivision_code)
    )
    return person
def person_data(self, representative, division_id, division_name, role, organization_name):
    """Build a ``Person`` from one representative's listing element.

    Args:
        representative: element queried with XPath for the name
            (``a/b/text()``), the text node containing "Phone", and the
            text of a ``mailto:`` link.
        division_id: division identifier; the part after the last ``:`` is
            used in the boundary URL.
        division_name: used as the person's district.
        role: role passed to ``Person``.
        organization_name: primary organization name passed to ``Person``.

    Returns:
        The populated ``Person``.

    Raises:
        IndexError: if the element has no ``a/b`` name text.

    People whose name is in ``duplicate_names`` receive a placeholder
    ``birth_date`` taken from ``self.birth_date``, which is then incremented.
    """
    # Corrections and tweaks.
    duplicate_names = {
        'Colleen Evans',
        'Kim Watt-Senner',
    }
    name_corrections = {
        'Claire l Moglove': 'Claire Moglove',
        'KSenya Dorwart': 'Ksenya Dorwart',
    }
    email_corrections = {
        'sharrison@qualicumbeach,com': '*****@*****.**',
    }

    # Get name: trim, collapse repeated spaces, then apply corrections.
    representative_name = re.sub(' +', ' ', str(representative.xpath('a/b/text()')[0]).strip())
    representative_name = name_corrections.get(representative_name, representative_name)

    # Get phone. The slice works on the string form of the XPath result list.
    # NOTE(review): the 12/-2 offsets look tied to the page layout - confirm.
    representative_phone = str(representative.xpath('text()[contains(., "Phone")]'))[12:-2].replace('-', '')

    # Get email. Not every representative has a mailto link; indexing the
    # empty result used to raise IndexError and abort the whole record.
    email_scrape = representative.xpath('a[contains(@href,"mailto:")]/text()')
    representative_email = None
    if email_scrape:
        representative_email = email_corrections.get(email_scrape[0], email_scrape[0])

    # Create record and append contact data.
    p = Person(
        primary_org='government',
        primary_org_name=organization_name,
        name=representative_name,
        district=division_name,
        role=role,
    )
    p.add_source(LIST_PAGE)

    # Handle duplicate names.
    if representative_name in duplicate_names:
        p.birth_date = str(self.birth_date)
        self.birth_date += 1

    if representative_email:
        p.add_contact('email', representative_email)
    # Only ten-character phone strings are kept.
    if representative_phone and len(representative_phone) == 10:
        p.add_contact('voice', representative_phone, 'legislature')

    p._related[0].extras['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(
        division_id.rsplit(':', 1)[1])
    return p
def scrape(self):
    """Yield organizations, posts and people described by the CSV at ``self.csv_url``.

    Header names are normalised with ``self.header_converter`` and each row
    accepted by ``self.is_valid_row`` is corrected using ``self.corrections``
    (a callable or a lookup dict per column).  An ``Organization`` is yielded
    the first time its lower-cased name is seen, a ``Post`` is yielded for
    every valid row, and a validated ``Person`` is yielded last.

    Any exception raised while handling a row is printed and the row is
    abandoned; items already yielded for that row are not withdrawn.
    """
    known_organizations = {}
    # role -> district -> number of seats handed out so far
    seat_numbers = defaultdict(lambda: defaultdict(int))

    reader = self.csv_reader(
        self.csv_url,
        delimiter=self.delimiter,
        header=True,
        encoding=self.encoding,
        skip_rows=self.skip_rows,
    )
    reader.fieldnames = [self.header_converter(field) for field in reader.fieldnames]

    # Optional telephone-like columns and the contact type each maps to.
    phone_columns = (('phone', 'voice'), ('fax', 'fax'), ('cell', 'cell'))

    for row in reader:
        try:
            if not self.is_valid_row(row):
                continue

            # Apply per-column corrections: either a function or a mapping.
            for column, fix in self.corrections.items():
                current = row[column]
                if isinstance(fix, dict):
                    if current in fix:
                        row[column] = fix[current]
                else:
                    row[column] = fix(current)

            organization_classification = 'legislature'
            organization_name = row['organization']
            organization_key = organization_name.lower()
            organization = known_organizations.get(organization_key)
            if organization is None:
                organization = Organization(
                    organization_name, classification=organization_classification)
                organization.add_source(self.csv_url)
                yield organization
                known_organizations[organization_key] = organization

            # A blank role defaults to councillor.
            if not row['primary role']:
                row['primary role'] = 'Councillor'
            role = row['primary role']

            yield Post(role=role, label=organization_name, organization_id=organization._id)

            name = row['name'].strip(' .,')
            district = row['district name']
            if self.many_posts_per_area and role not in self.unique_roles:
                per_district = seat_numbers[role]
                per_district[district] += 1
                district = '{} (seat {})'.format(district, per_district[district])

            person = Person(
                primary_org=organization_classification,
                name=name,
                district=district,
                role=role,
                party=row.get('party name'),
            )
            person.add_source(self.csv_url)

            if row.get('gender'):
                person.gender = row['gender']
            if row.get('photo url'):
                person.image = row['photo url']
            if row.get('source url'):
                person.add_source(row['source url'].strip(' .,'))

            if row.get('website'):
                person.add_link(row['website'], note='web site')
            if row.get('facebook'):
                # Drop any query string or fragment from the Facebook URL.
                person.add_link(re.sub(r'[#?].+', '', row['facebook']))
            if row.get('twitter'):
                person.add_link(row['twitter'])

            # 'email' and 'address' are indexed directly, so a missing column
            # raises and the row is skipped by the handler below.
            if row['email']:
                person.add_contact('email', row['email'].strip(' .,'))
            if row['address']:
                person.add_contact('address', row['address'], 'legislature')
            for column, contact_type in phone_columns:
                if row.get(column):
                    person.add_contact(contact_type, row[column], 'legislature')

            if row.get('birth date'):
                person.birth_date = row['birth date']
            if row.get('incumbent'):
                person.extras['incumbent'] = row['incumbent']

            if name in self.other_names:
                for other_name in self.other_names[name]:
                    person.add_name(other_name)

            # Validate person entity so that we can catch the exception if needed.
            person.validate()
            yield person
        except Exception as e:
            print(repr(e))
def scrape(self):
    """Yield a ``Person`` per candidate row of the CSV at ``COUNCIL_PAGE``, then the organizations.

    Rows are matched to census subdivisions either by an explicit
    ``district id`` column or by district name (only subdivisions whose id
    starts with ``59`` and whose classification is not ``IRI`` are indexed).
    One ``Organization`` is created per division; all organizations are
    yielded after every person has been yielded.

    Raises ``AssertionError`` when the CSV has no rows and ``Exception`` for
    an ambiguous district name or an unexpected role.
    """
    exclude_divisions = set()  # nothing is excluded by division id
    exclude_districts = {
        'Capital',
        'Capital F',
        'Capital G',
        'Capital H',
        'Central Coast B',
        'Central Okanagan East',
        'Central Okanagan West',
        'Comox Valley B',
        'Comox Valley C',
        'Islands Trust',
        'Kitimat-Stikine C',
        'Kootenay Boundary B',
        'Kootenay Boundary C',
        'Kootenay Boundary D',
        'Kootenay Boundary E',
        'Metro Vancouver A',
        'North Coast A',
        'North Coast C',
        'North Coast D',
        'North Coast E',
        'Okanagan-Similkameen I',
        'Okanagan-Similkameen Olalla Local Community Commission',
        'Qathet A',
        'Qathet B',
        'Qathet C',
        'Qathet D',
        'Qathet E',
    }
    expected_roles = {'candidate'}
    # Census subdivision classification -> word used in the council name.
    infixes = {
        'CY': 'City',
        'DM': 'District',
        'IGD': 'District',
        'IM': 'Municipal',
        'RGM': 'Regional',
        'T': 'Town',
        'VL': 'Village',
        'RDA': 'District',
    }
    duplicate_names = {'Rick Smith', 'Sung Y Wong', 'Elizabeth Taylor'}

    # Division name -> id; a name shared by several divisions maps to None.
    names_to_ids = {}
    for division in Division.get('ocd-division/country:ca').children('csd'):
        if not division.id.rsplit(':', 1)[1].startswith('59'):
            continue
        if division.attrs['classification'] == 'IRI':
            continue
        names_to_ids[division.name] = None if division.name in names_to_ids else division.id

    reader = self.csv_reader(COUNCIL_PAGE, header=True)
    reader.fieldnames = [field.lower() for field in reader.fieldnames]

    organizations = {}
    next_birth_year = 1900

    rows = list(reader)
    assert rows, 'No councillors found'

    for row in rows:
        name = row['full name']
        district_name = row['district name']
        # Skip blank rows, vacancies and excluded districts.
        if not any(row.values()):
            continue
        if name.lower() in ('', 'vacant'):
            continue
        if district_name in exclude_districts:
            continue

        has_explicit_id = row['district id']
        if has_explicit_id:
            division_id = 'ocd-division/country:ca/csd:{}'.format(row['district id'])
        else:
            division_id = names_to_ids[row['district name']]

        if division_id in exclude_divisions:
            continue
        if not division_id:
            raise Exception('unhandled collision: {}'.format(row['district name']))

        division = Division.get(division_id)
        division_name = division.name
        organization_name = '{} {} Council'.format(
            division_name, infixes[division.attrs['classification']])

        # Create each division's organization once.
        if division_id not in organizations:
            new_organization = Organization(name=organization_name, classification='government')
            organizations[division_id] = new_organization
            new_organization.add_source(COUNCIL_PAGE)
        organization = organizations[division_id]

        role = row['primary role']
        if role not in expected_roles:
            raise Exception('unexpected role: {}'.format(role))

        # Rows carrying an explicit id use the division id as the label.
        district = format(division_id) if has_explicit_id else division_name

        organization.add_post(role=role, label=district, division_id=division_id)

        person = Person(
            primary_org='government',
            primary_org_name=organization_name,
            name=name,
            district=district,
            role=role,
        )
        person.add_source(COUNCIL_PAGE)
        if row['source url']:
            person.add_source(row['source url'])

        # Known duplicate names get distinct placeholder birth dates.
        if name in duplicate_names:
            person.birth_date = str(next_birth_year)
            next_birth_year += 1

        if row['email']:
            person.add_contact('email', row['email'])
        if row['phone']:
            person.add_contact('voice', row['phone'], 'legislature')
        if row['twitter']:
            person.add_link(row['twitter'])

        subdivision_code = division_id.rsplit(':', 1)[1]
        person._related[0].extras['boundary_url'] = (
            '/boundaries/census-subdivisions/{}/'.format(subdivision_code)
        )

        yield person

    yield from organizations.values()
def scrape(self):
    """Yield people and organizations scraped from the municipality pages under ``COUNCIL_PAGE``.

    The sidebar of ``COUNCIL_PAGE`` links to list pages, each list page links
    to one detail page per municipality, and each detail page lists the
    council grouped by role.  Every municipality is matched by name to a
    census subdivision whose id starts with ``13``.  For each municipality
    its people are yielded first, then its ``Organization``.

    Raises:
        AssertionError: if an expected page element is missing.
        Exception: on a division name collision, a municipality seen twice,
            or a role outside ``expected_roles``.
        KeyError: if a municipality name or list title has no mapping.
    """
    exclude_divisions = {
        'ocd-division/country:ca/csd:1301006',  # Saint John
        'ocd-division/country:ca/csd:1307022',  # Moncton
        'ocd-division/country:ca/csd:1310032',  # Fredericton
    }
    expected_roles = {
        'Mayor',
        'Councillor',
    }
    unique_roles = {
        'Mayor',
    }
    # List-page link text -> word used in the council name.
    classifications = {
        'Cities': 'City',
        'Towns': 'Town',
        'Villages': 'Village',
        'Rural Communities': 'Community',
        'Regional Municipality': 'Regional',
    }
    # Page spelling -> division spelling.
    corrections = {
        'Beaubassin-est/East': 'Beaubassin East',
        'Lac-Baker': 'Lac Baker',
        'Saint-François-de-Madawaska': 'Saint-François de Madawaska',
        'Saint-Hilaire': 'Saint Hilaire',
    }
    unknown_names = {
        'Haut-Madawaska',  # incorporated after Census 2016
    }
    duplicate_names = {
        'Denis Savoie',
        'Josée Levesque',
        'Luc Levesque',
    }

    names_to_ids = {}
    for division in Division.get('ocd-division/country:ca').children('csd'):
        type_id = division.id.rsplit(':', 1)[1]
        if type_id.startswith('13'):
            if division.attrs['classification'] == 'P':
                continue
            if division.name in names_to_ids:
                raise Exception('unhandled collision: {}'.format(division.name))
            names_to_ids[division.name] = division.id

    page = self.lxmlize(COUNCIL_PAGE)
    list_links = page.xpath('//div[@id="sidebar"]//div[contains(@class, "list")][1]//a')

    birth_date = 1900
    seen = set()

    assert len(list_links), 'No list items found'
    for list_link in list_links:
        page = self.lxmlize(list_link.attrib['href'])
        detail_urls = page.xpath('//td[1]//@href')

        assert len(detail_urls), 'No municipalities found'
        for detail_url in detail_urls:
            page = self.lxmlize(detail_url, encoding='utf-8')
            # Expand a leading "St"/"St." so the name matches the division name.
            division_name = re.sub(r'\ASt\b\.?', 'Saint', page.xpath('//h1/text()')[0].split(' - ', 1)[1])
            division_name = corrections.get(division_name, division_name)

            if division_name in unknown_names:
                continue
            division_id = names_to_ids[division_name]
            if division_id in exclude_divisions:
                continue
            if division_id in seen:
                raise Exception('unhandled collision: {}'.format(division_id))

            seen.add(division_id)
            division_name = Division.get(division_id).name
            organization_name = '{} {} Council'.format(division_name, classifications[list_link.text])
            organization = Organization(name=organization_name, classification='government')
            organization.add_source(detail_url)

            address = ', '.join(page.xpath('//div[@class="left_contents"]/p[1]/text()'))

            contacts = page.xpath('//div[@class="left_contents"]/p[contains(., "Contact")]/text()')
            phone = contacts[0].split(':')[1]
            # Reset the fax for every municipality. It was previously left
            # unbound (or still holding the previous municipality's number)
            # when the page listed only one contact line.
            fax = contacts[1].split(':')[1] if len(contacts) > 1 else None
            email = self.get_email(page, '//div[@class="left_contents"]', error=False)

            url = page.xpath('//div[@class="left_contents"]//@href[not(contains(., "mailto:"))]')
            if url:
                url = url[0]

            groups = page.xpath('//div[contains(@class, "right_contents")]/p')
            assert len(groups), 'No groups found'
            for group in groups:
                # Headings are plural ("Councillors"); drop the trailing "s".
                role = group.xpath('./b/text()')[0].rstrip('s')
                if role not in expected_roles:
                    raise Exception('unexpected role: {}'.format(role))

                councillors = group.xpath('./text()')
                assert len(councillors), 'No councillors found'
                for seat_number, name in enumerate(councillors, 1):
                    if 'vacant' in name.lower():
                        continue

                    if role in unique_roles:
                        district = division_name
                    else:
                        district = '{} (seat {})'.format(division_name, seat_number)

                    organization.add_post(role=role, label=district, division_id=division_id)

                    p = Person(primary_org='government', primary_org_name=organization_name,
                               name=name, district=district, role=role)
                    p.add_source(COUNCIL_PAGE)
                    p.add_source(list_link.attrib['href'])
                    p.add_source(detail_url)

                    # Known duplicate names get distinct placeholder birth dates.
                    if name in duplicate_names:
                        p.birth_date = str(birth_date)
                        birth_date += 1

                    p.add_contact('address', address, 'legislature')
                    # @see https://en.wikipedia.org/wiki/Area_code_506
                    if phone:
                        p.add_contact('voice', phone, 'legislature', area_code=506)
                    if fax:
                        p.add_contact('fax', fax, 'legislature', area_code=506)
                    if email:
                        p.add_contact('email', email)
                    if url:
                        p.add_link(url)

                    p._related[0].extras['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(
                        division_id.rsplit(':', 1)[1])

                    yield p

            # Yielded after its posts have been added.
            yield organization
def scrape(self):
    """Scrape municipal councils linked from ``COUNCIL_PAGE``.

    Walks sidebar list links, then each municipality's detail page, and
    yields a ``Person`` per non-vacant council seat followed by the
    municipality's ``Organization``.  Municipalities are matched by name to
    census subdivisions whose id starts with ``13`` (classification ``P``
    is skipped); names in ``unknown_names`` and ids in ``exclude_divisions``
    are ignored.

    Raises:
        AssertionError: if no list links, municipalities, groups or
            councillors are found on a page.
        Exception: on a duplicate division name, a division seen twice, or
            an unexpected role.
        KeyError: if a municipality name or list title has no mapping.
    """
    exclude_divisions = {
        'ocd-division/country:ca/csd:1301006',  # Saint John
        'ocd-division/country:ca/csd:1307022',  # Moncton
        'ocd-division/country:ca/csd:1310032',  # Fredericton
    }
    expected_roles = {
        'Mayor',
        'Councillor',
    }
    unique_roles = {
        'Mayor',
    }
    # List-page link text -> word used in the council name.
    classifications = {
        'Cities': 'City',
        'Towns': 'Town',
        'Villages': 'Village',
        'Rural Communities': 'Community',
        'Regional Municipality': 'Regional',
    }
    # Page spelling -> division spelling.
    corrections = {
        'Beaubassin-est/East': 'Beaubassin East',
        'Lac-Baker': 'Lac Baker',
        'Saint-François-de-Madawaska': 'Saint-François de Madawaska',
        'Saint-Hilaire': 'Saint Hilaire',
    }
    unknown_names = {
        'Haut-Madawaska',  # incorporated after Census 2016
    }
    duplicate_names = {
        'Denis Savoie',
        'Josée Levesque',
        'Luc Levesque',
    }

    names_to_ids = {}
    for division in Division.get('ocd-division/country:ca').children('csd'):
        type_id = division.id.rsplit(':', 1)[1]
        if type_id.startswith('13'):
            if division.attrs['classification'] == 'P':
                continue
            if division.name in names_to_ids:
                raise Exception('unhandled collision: {}'.format(division.name))
            names_to_ids[division.name] = division.id

    page = self.lxmlize(COUNCIL_PAGE)
    list_links = page.xpath('//div[@id="sidebar"]//div[contains(@class, "list")][1]//a')

    birth_date = 1900
    seen = set()

    assert len(list_links), 'No list items found'
    for list_link in list_links:
        page = self.lxmlize(list_link.attrib['href'])
        detail_urls = page.xpath('//td[1]//@href')

        assert len(detail_urls), 'No municipalities found'
        for detail_url in detail_urls:
            page = self.lxmlize(detail_url, encoding='utf-8')
            # Expand a leading "St"/"St." so the name matches the division name.
            division_name = re.sub(r'\ASt\b\.?', 'Saint', page.xpath('//h1/text()')[0].split(' - ', 1)[1])
            division_name = corrections.get(division_name, division_name)

            if division_name in unknown_names:
                continue
            division_id = names_to_ids[division_name]
            if division_id in exclude_divisions:
                continue
            if division_id in seen:
                raise Exception('unhandled collision: {}'.format(division_id))

            seen.add(division_id)
            division_name = Division.get(division_id).name
            organization_name = '{} {} Council'.format(division_name, classifications[list_link.text])
            organization = Organization(name=organization_name, classification='government')
            organization.add_source(detail_url)

            address = ', '.join(page.xpath('//div[@class="left_contents"]/p[1]/text()'))

            contacts = page.xpath('//div[@class="left_contents"]/p[contains(., "Contact")]/text()')
            phone = contacts[0].split(':')[1]
            # A page with a single contact line has no fax. Without this
            # default the name was undefined on the first such page, or kept
            # the fax number of the municipality processed just before.
            fax = None
            if len(contacts) > 1:
                fax = contacts[1].split(':')[1]
            email = self.get_email(page, '//div[@class="left_contents"]', error=False)

            url = page.xpath('//div[@class="left_contents"]//@href[not(contains(., "mailto:"))]')
            if url:
                url = url[0]

            groups = page.xpath('//div[contains(@class, "right_contents")]/p')
            assert len(groups), 'No groups found'
            for group in groups:
                # Headings are plural ("Councillors"); drop the trailing "s".
                role = group.xpath('./b/text()')[0].rstrip('s')
                if role not in expected_roles:
                    raise Exception('unexpected role: {}'.format(role))

                councillors = group.xpath('./text()')
                assert len(councillors), 'No councillors found'
                for seat_number, name in enumerate(councillors, 1):
                    if 'vacant' in name.lower():
                        continue

                    if role in unique_roles:
                        district = division_name
                    else:
                        district = '{} (seat {})'.format(division_name, seat_number)

                    organization.add_post(role=role, label=district, division_id=division_id)

                    p = Person(primary_org='government', primary_org_name=organization_name,
                               name=name, district=district, role=role)
                    p.add_source(COUNCIL_PAGE)
                    p.add_source(list_link.attrib['href'])
                    p.add_source(detail_url)

                    # Known duplicate names get distinct placeholder birth dates.
                    if name in duplicate_names:
                        p.birth_date = str(birth_date)
                        birth_date += 1

                    p.add_contact('address', address, 'legislature')
                    # @see https://en.wikipedia.org/wiki/Area_code_506
                    if phone:
                        p.add_contact('voice', phone, 'legislature', area_code=506)
                    if fax:
                        p.add_contact('fax', fax, 'legislature', area_code=506)
                    if email:
                        p.add_contact('email', email)
                    if url:
                        p.add_link(url)

                    p._related[0].extras['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(
                        division_id.rsplit(':', 1)[1])

                    yield p

            # Yielded after its posts have been added.
            yield organization
def scrape(self):
    """Yield one ``Person`` per candidate row in the ``COUNCIL_PAGE`` CSV, then every organization.

    A row names its census subdivision either through the ``district id``
    column or, failing that, through ``district name`` looked up in an index
    of subdivisions whose id starts with ``59`` (classification ``IRI`` is
    left out).  Organizations are created lazily, one per division, and are
    yielded only after all rows have been processed.

    Raises ``Exception`` when a district name is shared by several divisions
    or a row has a role outside ``expected_roles``.
    """
    exclude_divisions = set()  # nothing is excluded by division id
    exclude_districts = {
        'Capital',
        'Capital F',
        'Capital G',
        'Capital H',
        'Central Coast B',
        'Central Okanagan East',
        'Central Okanagan West',
        'Comox Valley B',
        'Comox Valley C',
        'Islands Trust',
        'Kitimat-Stikine C',
        'Kootenay Boundary B',
        'Kootenay Boundary C',
        'Kootenay Boundary D',
        'Kootenay Boundary E',
        'Metro Vancouver A',
        'North Coast A',
        'North Coast C',
        'North Coast D',
        'North Coast E',
        'Okanagan-Similkameen I',
        'Okanagan-Similkameen Olalla Local Community Commission',
        'Qathet A',
        'Qathet B',
        'Qathet C',
        'Qathet D',
        'Qathet E',
    }
    expected_roles = {'candidate'}
    # Census subdivision classification -> word used in the council name.
    infixes = {
        'CY': 'City',
        'DM': 'District',
        'IGD': 'District',
        'IM': 'Municipal',
        'RGM': 'Regional',
        'T': 'Town',
        'VL': 'Village',
        'RDA': 'District',
    }
    duplicate_names = {'Rick Smith', 'Sung Y Wong', 'Elizabeth Taylor'}

    # Division name -> id, with None marking a name used by several divisions.
    names_to_ids = {}
    for division in Division.get('ocd-division/country:ca').children('csd'):
        code = division.id.rsplit(':', 1)[1]
        if not code.startswith('59'):
            continue
        if division.attrs['classification'] == 'IRI':
            continue
        if division.name in names_to_ids:
            names_to_ids[division.name] = None
        else:
            names_to_ids[division.name] = division.id

    reader = self.csv_reader(COUNCIL_PAGE, header=True)
    reader.fieldnames = [field.lower() for field in reader.fieldnames]

    organizations = {}
    placeholder_year = 1900

    for row in reader:
        name = row['full name']
        district_name = row['district name']
        # Skip blank rows, vacancies and excluded districts.
        if not any(row.values()):
            continue
        if name.lower() in ('', 'vacant'):
            continue
        if district_name in exclude_districts:
            continue

        explicit_id = row['district id']
        if explicit_id:
            division_id = 'ocd-division/country:ca/csd:{}'.format(row['district id'])
        else:
            division_id = names_to_ids[row['district name']]

        if division_id in exclude_divisions:
            continue
        if not division_id:
            raise Exception('unhandled collision: {}'.format(row['district name']))

        division = Division.get(division_id)
        division_name = division.name
        organization_name = '{} {} Council'.format(
            division_name, infixes[division.attrs['classification']])

        # Create each division's organization once.
        organization = organizations.get(division_id)
        if organization is None:
            organization = Organization(name=organization_name, classification='government')
            organizations[division_id] = organization
            organization.add_source(COUNCIL_PAGE)

        role = row['primary role']
        if role not in expected_roles:
            raise Exception('unexpected role: {}'.format(role))

        # Rows carrying an explicit id use the division id as the label.
        if explicit_id:
            district = format(division_id)
        else:
            district = division_name

        organization.add_post(role=role, label=district, division_id=division_id)

        candidate = Person(
            primary_org='government',
            primary_org_name=organization_name,
            name=name,
            district=district,
            role=role,
        )
        candidate.add_source(COUNCIL_PAGE)
        if row['source url']:
            candidate.add_source(row['source url'])

        # Known duplicate names get distinct placeholder birth dates.
        if name in duplicate_names:
            candidate.birth_date = str(placeholder_year)
            placeholder_year += 1

        if row['email']:
            candidate.add_contact('email', row['email'])
        if row['phone']:
            candidate.add_contact('voice', row['phone'], 'legislature')
        if row['twitter']:
            candidate.add_link(row['twitter'])

        boundary_path = '/boundaries/census-subdivisions/{}/'.format(division_id.rsplit(':', 1)[1])
        candidate._related[0].extras['boundary_url'] = boundary_path

        yield candidate

    yield from organizations.values()
def scrape(self):
    """Yield council members from the ``COUNCIL_PAGE`` CSV, then their organizations.

    Each row is matched to a census subdivision by its ``district id``
    column or, when that is blank, by ``district name`` (only subdivisions
    whose id starts with ``59`` and whose classification is not ``IRI`` are
    indexed).  Divisions in ``exclude_divisions`` are skipped.  Roles outside
    ``unique_roles`` are numbered per division as ``"<name> (seat N)"``.

    Yields:
        A ``Person`` per non-vacant row, followed by one ``Organization``
        per division encountered.

    Raises:
        Exception: if a district name is shared by several divisions or a
            row has a role outside ``expected_roles``.
        KeyError: if a district name or classification has no mapping.
    """
    exclude_divisions = {
        'ocd-division/country:ca/csd:5909052',  # Abbotsford
        'ocd-division/country:ca/csd:5915001',  # Langley (DM)
        'ocd-division/country:ca/csd:5915004',  # Surrey
        'ocd-division/country:ca/csd:5915015',  # Richmond
        'ocd-division/country:ca/csd:5915022',  # Vancouver
        'ocd-division/country:ca/csd:5915025',  # Burnaby
        'ocd-division/country:ca/csd:5915034',  # Coquitlam
        'ocd-division/country:ca/csd:5917021',  # Saanich
        'ocd-division/country:ca/csd:5917034',  # Victoria
        'ocd-division/country:ca/csd:5935010',  # Kelowna
    }
    expected_roles = {
        'Mayor',
        'Councillor',
    }
    unique_roles = {
        'Mayor',
    }
    # Census subdivision classification -> word used in the council name.
    infixes = {
        'CY': 'City',
        'DM': 'District',
        'IGD': 'District',
        'IM': 'Municipal',
        'RGM': 'Regional',
        'T': 'Town',
        'VL': 'Village',
    }
    duplicate_names = {
        'Colleen Evans',
    }

    # Division name -> id; a name shared by several divisions maps to None.
    names_to_ids = {}
    for division in Division.get('ocd-division/country:ca').children('csd'):
        type_id = division.id.rsplit(':', 1)[1]
        if type_id.startswith('59'):
            if division.attrs['classification'] == 'IRI':
                continue
            if division.name in names_to_ids:
                names_to_ids[division.name] = None
            else:
                names_to_ids[division.name] = division.id

    reader = self.csv_reader(COUNCIL_PAGE, header=True)
    reader.fieldnames = [field.lower() for field in reader.fieldnames]

    organizations = {}
    seat_numbers = defaultdict(int)

    birth_date = 1900
    seen = set()

    for row in reader:
        name = row['full name']

        # Skip blank rows and vacant seats.
        if not any(row.values()) or 'vacant' in name.lower():
            continue

        if row['district id']:
            division_id = 'ocd-division/country:ca/csd:{}'.format(row['district id'])
        else:
            division_id = names_to_ids[row['district name']]

        if division_id in exclude_divisions:
            continue
        if not division_id:
            raise Exception('unhandled collision: {}'.format(row['district name']))

        division = Division.get(division_id)

        division_name = division.name

        organization_name = '{} {} Council'.format(division_name, infixes[division.attrs['classification']])

        if division_id not in seen:
            seen.add(division_id)
            organizations[division_id] = Organization(name=organization_name, classification='government')
            organizations[division_id].add_source(COUNCIL_PAGE)

        organization = organizations[division_id]

        role = row['primary role']
        if role not in expected_roles:
            raise Exception('unexpected role: {}'.format(role))

        if role in unique_roles:
            district = division_name
        else:
            seat_numbers[division_id] += 1
            district = '{} (seat {})'.format(division_name, seat_numbers[division_id])
        # NOTE(review): the suffix is applied to every role here; the
        # collapsed original does not show whether it was nested under the
        # ``else`` above - confirm against the repository.
        if row['district id']:
            district += ' ({})'.format(division_id)

        organization.add_post(role=role, label=district, division_id=division_id)

        p = Person(primary_org='government', primary_org_name=organization_name,
                   name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        if row['source url']:
            p.add_source(row['source url'])

        # Known duplicate names get distinct placeholder birth dates.
        if name in duplicate_names:
            p.birth_date = str(birth_date)
            birth_date += 1

        # Blank cells must not become empty contact details.
        if row['email']:
            p.add_contact('email', row['email'])
        if row['phone']:
            p.add_contact('voice', row['phone'], 'legislature')

        p._related[0].extras['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(
            division_id.rsplit(':', 1)[1])

        yield p

    for organization in organizations.values():
        yield organization