def scrape(self):
    """Yield the mayor and councillors of Beaconsfield from the council page.

    Each person's name/district comes from an h1 heading of the form
    "Name, District"; contact details live in sibling elements.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//h1[@class="title"]')
    for councillor in councillors:
        # Headings without a comma are not "Name, District" entries; skip them.
        if ',' not in councillor.text_content():
            continue
        name, district = councillor.text_content().split(',')
        name = name.strip()
        if 'Mayor' in district:
            p = Person(primary_org='legislature', name=name, district='Beaconsfield', role='Maire')
            p.add_source(COUNCIL_PAGE)
            p.image = councillor.xpath('./parent::div/parent::div/p//img/@src')[0]
            # Mayor contact info sits in sibling <div>s (councillors use <p>s below).
            phone = councillor.xpath('.//parent::div/following-sibling::div[contains(text(), "514")]/text()')[0]
            phone = phone.split(':')[1].strip().replace(' ', '-')
            p.add_contact('voice', phone, 'legislature')
            # Email is obfuscated in an inline <script>; get_email decodes it.
            script = councillor.xpath('.//parent::div/following-sibling::div/script')[0].text_content()
            p.add_contact('email', get_email(script))
            yield p
            continue
        # Councillor districts look like "... - N"; keep the part after the dash.
        district = district.split('-')[1].strip()
        p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath('./parent::div/parent::div/p//img/@src')[0]
        phone = councillor.xpath('.//parent::div/following-sibling::p[contains(text(), "514")]/text()')
        if phone:
            phone = phone[0]
            phone = phone.split(':')[1].strip().replace(' ', '-')
            p.add_contact('voice', phone, 'legislature')
        script = councillor.xpath('.//parent::div/following-sibling::p/script')[0].text_content()
        p.add_contact('email', get_email(script))
        yield p
def scrape(self):
    """Yield councillors from the council table, then the mayor from MAYOR_PAGE."""
    page = self.lxmlize(COUNCIL_PAGE)
    # Skip the header row.
    councillors = page.xpath('//table/tbody/tr')[1:]
    for councillor in councillors:
        name = councillor.xpath('.//a')[0].text_content()
        if 'District ' in name:
            # Vacant
            continue
        district = 'District {}'.format(councillor.xpath('.//strong')[0].text_content())
        address = councillor.xpath('.//td')[2].text_content().replace("\r\n", ', ')
        contact_nodes = councillor.xpath('.//td[4]/text()')
        # Some rows wrap contact text in <p>; fall back when no "label: value" found.
        if ':' not in contact_nodes[0]:
            contact_nodes = councillor.xpath('.//td[4]/p/text()')
        phone = contact_nodes[0].split(':')[1].replace("(", '').replace(") ", '-')
        if 'or' in phone:
            # phone and cell
            phone = phone.split('or')[0]
        # email protected by js
        p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        if 'F' in contact_nodes[1]:
            fax = contact_nodes[1].split(':')[1].replace("(", '').replace(") ", '-')
            p.add_contact('fax', fax, 'legislature')
        councillor_url = councillor.xpath('.//a/@href')[0]
        p.add_source(councillor_url)
        page = self.lxmlize(councillor_url)
        # The portrait, when present, carries the member's name in its title.
        image = page.xpath('//img[contains(@title, "{0}")]/@src'.format(name))
        if image:
            p.image = image[0]
        yield p
    # --- Mayor, scraped from a separate page. ---
    mayorpage = self.lxmlize(MAYOR_PAGE)
    mayor_name_nodes = mayorpage.xpath('//p/*[contains(text(), "Mayor")]//text()')
    for node in mayor_name_nodes:
        # First node matching "Mayor First Last" wins.
        result = re.search('Mayor ([A-Z].+ [A-Z].+[^:])', node)
        if result is not None:
            name = result.group(1)
            break
    photo_url = mayorpage.xpath('//span/img/@src')[0]
    contact_nodes = mayorpage.xpath('//aside//h3[contains(text(), "Contact")]/following-sibling::div[1]')[0]
    address = contact_nodes.xpath('.//p[1]/text()')[0]
    phone = contact_nodes.xpath('.//p[2]/text()')[0].split(': ')[1]
    fax = contact_nodes.xpath('.//p[2]/text()')[1].split(': ')[1]
    email = self.get_email(contact_nodes.xpath('.//p[3]')[0])
    p = Person(primary_org='legislature', name=name, district='Cape Breton', role='Mayor')
    p.add_source(MAYOR_PAGE)
    p.add_contact('address', address, 'legislature')
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('email', email)
    p.image = photo_url
    yield p
def scrape(self):
    """Yield the Caledon mayor, then councillors laid out in a 4-column table."""
    page = self.lxmlize(COUNCIL_PAGE)
    # The mayor's cell spans both table rows.
    node = page.xpath('//td[@rowspan="2"]')[0]
    name = node.xpath('.//h3/strong/text()')[0]
    image = node.xpath('.//@src')[0]
    voice = self.get_phone(node)
    url = node.xpath('.//a[contains(., "Visit")]/@href')[0]
    p = Person(primary_org='legislature', name=name, district='Caledon', role='Mayor')
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    p.add_contact('voice', voice, 'legislature')
    p.add_contact('email', self.get_email(self.lxmlize(url)))
    p.image = image
    yield p
    councillors = page.xpath('//div[@id="printAreaContent"]//table[2]//td')
    # Drop cells 12-15 (a spacer row in the layout table).
    councillors = councillors[:12] + councillors[16:]
    assert len(councillors), 'No councillors found'
    for i in range(len(councillors) // 3):
        # Table is 4 cells wide, 3 logical rows per councillor block:
        # headings at i, photos at i+4, details at i+8. Remap the flat index.
        i = i // 4 * 12 + i % 4
        district, role = councillors[i].xpath('.//h3/text()')
        name = councillors[i + 8].xpath('.//strong/text()')[0]
        voice = self.get_phone(councillors[i + 8])
        url = councillors[i + 8].xpath('.//a[contains(., "Visit")]/@href')[0]
        if 'photo to come' in councillors[i + 4].text_content():
            image = None
        else:
            image = councillors[i + 4].xpath('.//@src')[0]
        # Normalize non-breaking spaces before comparing district text.
        district = district.replace('\xa0', ' ')
        if ' and ' in district:
            district = district.replace('Ward ', 'Wards ')
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('voice', voice, 'legislature')
        p.add_contact('email', self.get_email(self.lxmlize(url)))
        if image:
            p.image = image
        yield p
def scrape(self):
    """Yield the Charlottetown mayor (first Title span) then ward councillors."""
    root = self.lxmlize(COUNCIL_PAGE)
    everyone = root.xpath('//span[@class="Title"]')
    mayornode = everyone[0]
    mayor = {}
    spantext = ' '.join(mayornode.xpath('.//text()'))
    # Name is everything before the first "(" in the heading text.
    mayor['name'] = re.search(r'[^(]+', spantext).group(0).strip()
    mayor['photo_url'] = urljoin(COUNCIL_PAGE, mayornode.xpath('img/@src')[0])
    mayor['email'] = mayornode.xpath('following::a[1]/text()')[0]
    m = Person(primary_org='legislature', name=mayor['name'], district='Charlottetown', role='Mayor')
    m.add_source(COUNCIL_PAGE)
    m.add_contact('email', mayor['email'])
    m.image = mayor['photo_url']
    yield m
    councillors = root.xpath('//span[@class="Title"]')[1:]
    assert len(councillors), 'No councillors found'
    for span in councillors:
        spantext = ' '.join(span.xpath('.//text()'))
        # Normalize en-dash variants to "-" so headings split as "Name - Ward N".
        header = spantext.replace('\u2013', '-').replace('\x96', '-').split('-')
        if len(header) != 2:
            continue
        name = header[0].strip()
        name = name.replace('Councillor', '')
        name = re.sub(r'\(.+?\)', '', name)
        name = ' '.join(name.split())
        # Keep only "Ward N" from the right-hand side.
        district_id = ' '.join(header[1].split()[:2])
        # needed a wacky xpath to deal with ward 8
        photo = span.xpath('preceding::hr[1]/following::img[1]/@src')
        photo_url = urljoin(COUNCIL_PAGE, photo[0])
        email = span.xpath('string(following::a[1]/text())')  # can be empty
        p = Person(primary_org='legislature', name=name, district=district_id, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        if email:
            p.add_contact('email', email)
        p.image = photo_url
        yield p
def scrape(self):
    """Yield Beaconsfield's mayor and councillors.

    Headings are "Name, District"; the mayor's contacts are in sibling
    <div> elements while councillors' are in sibling <p> elements.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//h1[@class="title"]')
    for councillor in councillors:
        # Only "Name, District"-shaped headings are member entries.
        if ',' not in councillor.text_content():
            continue
        name, district = councillor.text_content().split(',')
        name = name.strip()
        if 'Mayor' in district:
            p = Person(primary_org='legislature', name=name, district='Beaconsfield', role='Maire')
            p.add_source(COUNCIL_PAGE)
            p.image = councillor.xpath(
                './parent::div/parent::div/p//img/@src')[0]
            # Phone line is the sibling div mentioning the 514 area code.
            phone = councillor.xpath(
                './/parent::div/following-sibling::div[contains(text(), "514")]/text()'
            )[0]
            phone = phone.split(':')[1].strip().replace(' ', '-')
            p.add_contact('voice', phone, 'legislature')
            # Obfuscated email is decoded from the inline script by get_email.
            script = councillor.xpath(
                './/parent::div/following-sibling::div/script'
            )[0].text_content()
            p.add_contact('email', get_email(script))
            yield p
            continue
        # "... - N" → keep the district number after the dash.
        district = district.split('-')[1].strip()
        p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath(
            './parent::div/parent::div/p//img/@src')[0]
        phone = councillor.xpath(
            './/parent::div/following-sibling::p[contains(text(), "514")]/text()'
        )
        if phone:
            phone = phone[0]
            phone = phone.split(':')[1].strip().replace(' ', '-')
            p.add_contact('voice', phone, 'legislature')
        script = councillor.xpath(
            './/parent::div/following-sibling::p/script')[0].text_content()
        p.add_contact('email', get_email(script))
        yield p
def scrape(self):
    """Yield the Pointe-Claire mayor, then councillors from paired table rows.

    Odd rows hold councillor names/photos; the following even row holds the
    matching district and phone for each column.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    mayor = page.xpath('.//div[@class="item-page clearfix"]//table[1]//p')[1]
    name = mayor.xpath('.//strong/text()')[0]
    p = Person(primary_org='legislature', name=name, district='Pointe-Claire', role='Maire')
    p.add_source(COUNCIL_PAGE)
    # Match "NNN NNN-NNNN" or "NNN-NNN-NNNN" and normalize to dashes.
    phone = re.findall(r'[0-9]{3}[ -][0-9]{3}-[0-9]{4}', mayor.text_content())[0].replace(' ', '-')
    p.add_contact('voice', phone, 'legislature')
    yield p
    rows = page.xpath('//tr')
    for i, row in enumerate(rows):
        # Even-indexed rows are detail rows; only iterate the name rows.
        if i % 2 == 0:
            continue
        councillors = row.xpath('./td')
        for j, councillor in enumerate(councillors):
            name = councillor.text_content()
            # rows[i + 1].xpath('.//td//a[contains(@href, "maps")]/text()')[j]
            # district number
            district = rows[i + 1].xpath('.//td/p[1]/text()')[j].replace(' / ', '/')
            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.image = councillor.xpath('.//img/@src')[0]
            phone = re.findall(r'[0-9]{3}[ -][0-9]{3}-[0-9]{4}', rows[i + 1].xpath('.//td')[j].text_content())[0].replace(' ', '-')
            p.add_contact('voice', phone, 'legislature')
            yield p
def scrape(self):
    """Yield Québec City's mayor and district councillors.

    The section heading preceding each member block ("Mairie" or a district
    name) determines the role; district names are normalized (dashes,
    articles) to match the official division names.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[contains(@class, "ligne")]')
    for councillor in councillors:
        # Names appear as "Last, First"; reverse into "First Last".
        name = ' '.join(councillor.xpath('.//h3')[0].text_content().strip().split(', ')[::-1])
        if 'vacant' in name:
            continue
        district = councillor.xpath('./preceding-sibling::h2/text()')[-1]
        if 'Mairie' in district:
            district = 'Québec'
            role = 'Maire'
        else:
            text = councillor.xpath('.//a[@target="_blank"]/text()')
            # Raw string: '\A', '\d' and '\Z' are regex anchors, not valid
            # string escapes — a plain literal triggers SyntaxWarning on
            # modern Python.
            district = re.search(r'\ADistrict électoral (?:de|du|des) (.+) - ?\d+\Z', text[0].strip().replace('\xa0', ''), flags=re.U).group(1)
            role = 'Conseiller'
        if district == 'Monts':
            district = 'Les Monts'
        elif district == 'Plateau':
            district = 'Le Plateau'
        else:
            district = re.sub('–', '—', district)  # n-dash, m-dash
            district = re.sub(r'\Ala ', 'La ', district)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath('./p//img/@src')[0]
        phone = self.get_phone(councillor, area_codes=[418])
        p.add_contact('voice', phone, 'legislature')
        yield p
def scrape(self):
    """Yield the mayor, then councillors from the Longueuil council table.

    Greenfield Park borough seats share a district name, so they are
    disambiguated with an incrementing seat number.
    """
    page = self.lxmlize(COUNCIL_PAGE, 'utf-8')
    yield self.scrape_mayor(page)
    trs = page.xpath('//tbody/tr')
    assert len(trs), 'No councillors found'
    seat_number = 1
    for tr in trs:
        # Skip vacant seats.
        if tr.xpath('./td[2]//text()')[0] != 'Vacant':
            district = tr.xpath('./td[1]/text()')[0]
            if 'Greenfield Park' in district or 'Conseiller n' in district:
                district = 'Greenfield Park (siège {})'.format(seat_number)
                seat_number += 1
            detail_url = tr.xpath('./td[2]/a/@href')[0]
            detail_page = self.lxmlize(detail_url, 'utf-8')
            name = detail_page.xpath('//h1/text()')[0]
            # Prefer the portrait whose alt text carries the member's name;
            # fall back to the right-floated image.
            photo_node = detail_page.xpath(
                '//img[contains(@alt, "{0}")]/@src'.format(name))
            if photo_node:
                photo_url = photo_node[0]
            else:
                photo_url = detail_page.xpath(
                    '//img[contains(@class, "droite")]/@src')[0]
            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)
            p.image = photo_url
            p.add_contact('email', self.get_email(detail_page))
            yield p
def scrape(self):
    """Yield the mayor (first cell) and councillors from a SharePoint table.

    Each councillor cell links to a detail page that provides phone numbers,
    social links and an email address.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@class="article-content"]//td[@class="ms-rteTableOddCol-0"]')
    # Assert BEFORE indexing: an empty result previously raised IndexError on
    # councillors[0] instead of this clearer assertion message.
    assert len(councillors), 'No councillors found'
    yield self.scrape_mayor(councillors[0])
    for councillor in councillors[1:]:
        # Cells without a link are fillers, not members.
        if not councillor.xpath('.//a'):
            continue
        texts = [text for text in councillor.xpath('.//text()') if clean_string(text)]
        name = texts[0]
        district = texts[1]
        url = councillor.xpath('.//a/@href')[0]
        page = self.lxmlize(url)
        p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = councillor.xpath('./preceding-sibling::td//img/@src')[-1]
        contacts = page.xpath('.//td[@class="ms-rteTableOddCol-0"]//text()')
        for contact in contacts:
            # Any text containing a 4-digit run is treated as a phone number.
            if re.findall(r'[0-9]{4}', contact):
                phone = contact.strip().replace(' ', '-')
                p.add_contact('voice', phone, 'legislature')
        get_links(p, page.xpath('.//td[@class="ms-rteTableOddCol-0"]')[0])
        email = self.get_email(page)
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Yield MLAs from the members table (name / constituency / party columns)."""
    member_page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
    table = member_page.xpath('//table')[0]
    # Skip the header row.
    rows = table.xpath('.//tr')[1:]
    assert len(rows), 'No members found'
    for row in rows:
        (namecell, constitcell, partycell) = row.xpath('.//td')
        full_name = namecell.text_content().strip()
        if full_name.lower() == 'vacant':
            continue
        # Names are "Last, First"; drop honorifics and re-order.
        (last, first) = full_name.split(',')
        name = first.replace('Hon.', '').strip() + ' ' + last.title().strip()
        # Collapse internal whitespace in the constituency name.
        district = ' '.join(constitcell.text_content().split())
        party = get_party(partycell.text)
        url = namecell.xpath('.//a')[0].get('href')
        page = self.lxmlize(url)
        email = self.get_email(page)
        p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('email', email)
        image = page.xpath('//img[@class="page_graphic"]/@src')
        if image:
            p.image = image[0]
        yield p
def scrape(self):
    """Yield Dollard-Des Ormeaux's mayor and councillors.

    Members share a single city-hall phone/fax number that appears once at
    the top of the page.
    """
    page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    general_contacts = page.xpath('//p[@class="large_title"]/following-sibling::p/text()')
    general_phone = general_contacts[0]
    general_fax = general_contacts[1]
    councillors = page.xpath('//tr/td/p/b')
    for councillor in councillors:
        text = councillor.text_content()
        # Skip non-member bold headings (email links, newsletter banner).
        if '@' in text or 'NEWSLETTER' in text:
            continue
        if 'Mayor' in text:
            name = text.replace('Mayor', '')
            district = 'Dollard-Des Ormeaux'
            role = 'Maire'
        else:
            # Headings look like "District N Name": split/extract on the digit.
            name = re.split(r'[0-9]', text)[1]
            district = 'District ' + re.findall(r'[0-9]', text)[0]
            role = 'Conseiller'
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        # Portrait sits in the table row above the name cell.
        p.image = councillor.xpath('./parent::p/parent::td/parent::tr/preceding-sibling::tr//img/@src')[0]
        email = self.get_email(councillor, './parent::p/following-sibling::p')
        p.add_contact('email', email)
        p.add_contact('voice', general_phone, 'legislature')
        p.add_contact('fax', general_fax, 'legislature')
        yield p
def scrape(self):
    """Yield the mayor (via scrape_mayor) and councillors from a photo table."""
    page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT)
    mayor_url = page.xpath('//a[contains(text(), "Mayor")]/@href')[0]
    mayor = self.scrape_mayor(mayor_url)
    if mayor:
        yield mayor
    councillors_url = page.xpath('//a[contains(text(), "Councillors")]/@href')[0]
    cpage = self.lxmlize(councillors_url, user_agent=CUSTOM_USER_AGENT)
    # Rows containing an image are member rows; the last row is a footer.
    councillors = cpage.xpath('//tr[td//img]')[:-1]
    assert len(councillors), 'No councillors found'
    for councillor_row in councillors:
        img_cell, info_cell = tuple(councillor_row)
        if info_cell.xpath('.//p//text()[contains(., "Vacant")]'):
            continue
        # Keep non-empty text fragments (treat non-breaking spaces as blank).
        cells = [x.strip() for x in info_cell.xpath('.//text()') if re.sub('\xa0', ' ', x).strip()]
        name = cells[0].replace('Councillor ', '')
        district = info_cell.xpath('.//p[contains(text(), "District")]//text()')[0]
        email = self.get_email(info_cell)
        # error=False: some members list no phone number.
        phone = self.get_phone(info_cell, area_codes=[438, 514], error=False)
        img_url_rel = img_cell.xpath('.//img/@src')[0]
        img_url = urljoin(councillors_url, img_url_rel)
        p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(councillors_url)
        p.add_contact('email', email)
        if phone:
            p.add_contact('voice', phone, 'legislature')
        p.image = img_url
        yield p
def scrape(self):
    """Yield Milton's mayor and councillors from the council table.

    Local roles are normalized to the standard role names; the first row
    (the mayor) additionally carries a mailing address.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//table[@id="Table1table"]/tbody/tr')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        name = councillor.xpath('./td[2]/p/text()')[1]
        role = councillor.xpath('./td[2]/p/text()')[0].strip()
        # Normalize site-specific role labels.
        if role == 'Mayor and Regional Councillor':
            role = 'Mayor'
        elif role == 'Local & Regional Councillor':
            role = 'Regional Councillor'
        elif role == 'Local Councillor':
            role = 'Councillor'
        # Rows without a third text line are at-large (town-wide) seats.
        if len(councillor.xpath('./td[2]/p/text()')) < 3:
            district = 'Milton'
        else:
            district = councillor.xpath('./td[2]/p/text()')[2]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath('./td[1]/p//img/@src')[0]
        if councillor == councillors[0]:
            address = ', '.join(councillor.xpath('./td[3]/p[1]/text()')).replace('Email:', '').strip()
            p.add_contact('address', address, 'legislature')
        numbers = councillor.xpath('./td[3]/p[2]/text()')
        for number in numbers:
            # Split on the FIRST colon only — the value part may itself
            # contain a colon (e.g. an extension), which would make a bare
            # split(':') raise ValueError on unpacking.
            num_type, number = number.split(':', 1)
            number = number.replace(', ext ', ' x').strip()
            p.add_contact(num_type, number, num_type)
        yield p
def scrape(self):
    """Yield one Person per member block on the Westmount council page."""
    page = self.lxmlize(COUNCIL_PAGE)
    members = page.xpath('//div[@class="member-container"]')
    for member in members:
        full_name = member.xpath('.//h3')[0].text_content()
        position = member.xpath(
            './/div[@class="member-position"]')[0].text_content()
        if 'Maire' in position:
            role, district = 'Maire', 'Westmount'
        else:
            role = 'Conseiller'
            district = member.xpath(
                './/div[@class="entry-content"]/text()')[0]
        person = Person(primary_org='legislature', name=full_name,
                        district=district, role=role)
        person.add_source(COUNCIL_PAGE)
        # The press-photo link doubles as the portrait URL.
        person.image = member.xpath(
            './/a[@title="Photo pour la presse"]/@href')[0]
        person.add_contact('email', self.get_email(member))
        yield person
def scrape(self):
    """Yield Saint-Jérôme's mayor and district councillors."""
    page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
    members = page.xpath('//div[contains(@class," inner_member")]')
    assert len(members), 'No councillors found'
    for member in members:
        full_name = member.xpath('.//h2/text()')[0]
        seat = member.xpath(
            './/div[contains(@class,"district")]/text()')[0].replace('numéro ', '')
        if 'Maire' in seat:
            seat, role = 'Saint-Jérôme', 'Maire'
        else:
            role = 'Conseiller'
        # Portraits are lazy-loaded; the real URL lives in data-lazy-src.
        portrait = member.xpath(
            './/div[@class="portrait_single"]/img/@data-lazy-src')[0]
        phone = member.xpath(
            './/div[contains(@class,"phone")]/text()')[0]
        person = Person(primary_org='legislature', name=full_name,
                        district=seat, role=role)
        person.add_source(COUNCIL_PAGE)
        person.image = portrait
        person.add_contact('voice', phone, 'legislature')
        person.add_contact('email', self.get_email(member))
        yield person
def scrape(self):
    """Yield Saint-Jérôme members from table rows of (district,) name, phone, email.

    The mayor's row has only 3 text fragments (no district line); councillor
    rows have 4, so fields are read from the END of the list.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    # Two-cell rows are member rows; the last one is a footer.
    councillor_trs = [tr for tr in page.xpath('//table//tr[1]') if len(tr) == 2][:-1]
    for councillor_tr in councillor_trs:
        desc = [text.strip() for text in councillor_tr.xpath('.//text()[normalize-space()]') if text.strip()]
        if len(desc) == 3:
            role = 'Maire'
            district = 'Saint-Jérôme'
        else:
            role = 'Conseiller'
            district = desc[0].replace('numéro ', '')
        # Last three fragments are always name, phone, email.
        name = desc[-3]
        phone = desc[-2]
        email = desc[-1]
        image = councillor_tr.xpath('.//img/@src')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = image
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Yield MLAs from the members table, with details from each member page."""
    page = self.lxmlize(COUNCIL_PAGE)
    members = page.xpath('//table[1]//tr')
    assert len(members), 'No members found'
    for member in members:
        # Skip spacer/empty rows.
        if not member.text_content().strip():
            continue
        name = member.xpath('./td[2]//a[1]//text()')[0]
        # District link text looks like "MLA: Part1 - Part2"; normalize
        # "St " to "St. " then rejoin the two hyphenated halves.
        district_name = member.xpath('./td[2]//a[contains(.//text(), "MLA")]//text()')[0].split(':')[1].replace('St ', 'St. ').split('-')
        district = district_name[0].strip() + '-' + district_name[1].strip()
        url = member.xpath('./td[2]//a[1]/@href')[0]
        ext_infos = self.scrape_extended_info(url)
        p = Person(primary_org='legislature', name=name, district=district, role='MLA')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        if ext_infos:
            # member pages might return errors
            email, phone, photo_url = ext_infos
            p.image = photo_url
            if email:
                p.add_contact('email', email)
            if phone:
                p.add_contact('voice', phone, 'legislature')
        yield p
def scrape(self):
    """Yield Milton's mayor and councillors.

    The first text line of each row is "Role Ward N" (or the mayor's
    combined title); roles are normalized to standard names.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//table[@id="Table1table"]/tbody/tr')
    assert len(councillors), 'No councillors found'
    for i, councillor in enumerate(councillors):
        role_district = councillor.xpath('./td[2]/p/text()')[0].strip()
        if 'Mayor' in role_district:
            name = role_district.replace('Mayor and Regional Councillor', '')
            role = 'Mayor'
            district = 'Milton'
        else:
            name = councillor.xpath('./td[2]/p/text()')[1]
            # Split "Role Ward N" into role and "Ward N".
            role, district = re.split(r' (?=Ward)', role_district)
            if role == 'Town and Regional Councillor':
                role = 'Regional Councillor'
            elif role == 'Town Councillor':
                role = 'Councillor'
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath('./td[1]/p//img/@src')[0]
        numbers = councillor.xpath('./td[3]/p[2]/text()')
        for number in numbers:
            # Split on the FIRST colon only: the value may contain another
            # colon (e.g. an extension), which would break a bare split(':')
            # two-way unpack with a ValueError.
            num_type, number = number.split(':', 1)
            number = number.replace(', ext ', ' x').strip()
            p.add_contact(num_type, number, num_type)
        yield p
def scrape(self):
    """Yield the mayor (via scrape_mayor) and ward councillors.

    Each councillor link leads to a detail page providing the ward number,
    portrait and labelled contact details.
    """
    page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    yield self.scrape_mayor()
    councillors = page.xpath('//div[@class="articlebody-inside"]//p[contains(text(),"-")]')
    for councillor in councillors:
        url = councillor.xpath('.//a')[0].attrib['href'].replace('../', '')
        page = self.lxmlize(url, 'iso-8859-1')
        name = page.xpath('//div[@class="articletitle"]/h1')[0].text_content().replace('Councillor', '').replace('Deputy Mayor', '')
        # Raw string r'\D+': '\D' is a regex class, not a valid string
        # escape — a plain literal triggers SyntaxWarning on modern Python.
        district = 'Ward {}'.format(re.sub(r'\D+', '', page.xpath('//div[@class="articlebody-inside"]/p')[0].text_content()))
        p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        photo_url_rel = page.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0].replace('/..', '')
        p.image = urljoin(url, photo_url_rel)
        # Contacts come as "Label: value Label: value ..." — split on colons,
        # then pair each value with the label that precedes it.
        contacts = page.xpath('//div[@class="articlebody-inside"]/p')[1].text_content().replace('Biography', '').replace('Committees', '').split(':')
        for i, contact in enumerate(contacts):
            if i == 0 or not contact:
                continue
            contact_type = re.findall(r'([A-Z][a-z]+)', contacts[i - 1])[0]
            # Trim the start of the NEXT label off the value (addresses keep it).
            if contact_type != 'Address':
                contact = re.split(r'[A-Z]', contact)[0]
            contact_type = CONTACT_DETAIL_TYPE_MAP[contact_type]
            p.add_contact(contact_type, contact, '' if contact_type == 'email' else 'legislature')
        yield p
def scrape(self):
    """Yield Ajax members; identity is parsed from the portrait's alt text."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//table[@class="councilTable"]')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        image = councillor.xpath('.//@src')[0]
        alt = councillor.xpath('.//tr/td[1]/p[1]/img/@alt')[0]
        if 'Mayor' in alt:
            name = alt
            district = 'Ajax'
            role = 'Mayor'
        else:
            # Alt text looks like "Name - <role> Councillor Ward N ...":
            # district is what follows "Councillor ", role what precedes "Ward ".
            name, rest = alt.split(' - ', 1)
            district = rest.split('Councillor ', 1)[-1].strip()
            role = rest.split('Ward ', 1)[0].strip()
        # The paragraph mentioning "Cel" lists cell then office numbers.
        cell = councillor.xpath('.//p[contains(.,"Cel")]/text()')[0].replace('\xa0', ' ')
        voice = councillor.xpath('.//p[contains(.,"Cel")]/text()')[1]
        email = self.get_email(councillor)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = image
        if cell:
            p.add_contact('cell', cell, 'legislature')
        if voice:
            p.add_contact('voice', voice, 'legislature')
        if email:
            p.add_contact('email', email)
        yield p
def scrape(self):
    """Yield Québec City's mayor and district councillors (newer page layout).

    The h2 heading preceding each section ("Mairie" or a district title)
    drives the role; district names are normalized to official spellings.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    sections = page.xpath('//div[contains(@class, "membres-conseil-municipal")]')
    for section in sections:
        councillors = section.xpath('./div')
        assert len(councillors), 'No councillors found'
        for councillor in councillors:
            # Name fragments are "Last, First"-ordered; reverse them.
            name = ' '.join(reversed(councillor.xpath('./h3//text()')))
            if 'vacant' in name.lower():
                continue
            header = section.xpath('./preceding-sibling::h2/text()')[-1]
            if 'Mairie' in header:
                district = 'Québec'
                role = 'Maire'
            else:
                district = councillor.xpath('./p[@itemprop="jobTitle"]/a/text()')[0]
                district = re.search(r'\ADistrict (?:de(?: la)?|du|des) ([\w —–-]+)', district, flags=re.U).group(1)
                role = 'Conseiller'
            if district == 'Saules':
                district = 'Les Saules'
            else:
                district = re.sub(r'–', '—', district)  # n-dash, m-dash
            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.image = councillor.xpath('./figure//@src')[0]
            p.add_contact('voice', self.get_phone(councillor, area_codes=[418]), 'legislature')
            yield p
def scrape(self):
    """Yield Saanich's mayor and councillors from their member pages.

    Councillors share the municipal district, so each gets a sequential
    "(seat N)" suffix.
    """
    seat = 1
    listing = self.lxmlize(COUNCIL_PAGE)
    member_links = listing.xpath('//div[contains(@class, "entry")]')[0].xpath('.//@href')
    assert len(member_links), 'No councillors found'
    for url in member_links:
        # mailto: links appear among the hrefs; skip them.
        if '@' in url:
            continue
        detail = self.lxmlize(url)
        main = detail.xpath('//main[@id="content"]')[0]
        name = main.xpath('.//h1//text()')[0]
        if 'Mayor' in main.text_content():
            name = name.replace('Mayor ', '')
            role, district = 'Mayor', 'Saanich'
        else:
            role = 'Councillor'
            district = 'Saanich (seat {})'.format(seat)
            seat += 1
        person = Person(primary_org='legislature', name=name,
                        district=district, role=role)
        person.image = detail.xpath('.//@src')[0]
        person.add_contact('voice',
                           self.get_phone(detail, area_codes=[250]),
                           'legislature')
        person.add_contact('email', self.get_email(main))
        person.add_source(COUNCIL_PAGE)
        person.add_source(url)
        yield person
def scrape(self):
    """Yield Ontario MPPs from the address-block listing plus member pages."""
    page = self.lxmlize(COUNCIL_PAGE)
    for block in page.xpath('//div[@class="addressblock"]'):
        name_elem = block.xpath('.//a[@class="mpp"]')[0]
        # Collapse internal whitespace in the name.
        name = ' '.join(name_elem.text.split())
        # Normalize double-hyphens to em-dashes in riding names.
        riding = block.xpath('.//div[@class="riding"]//text()')[0].strip().replace('--', '\u2014')
        district = riding.replace('Chatham—Kent', 'Chatham-Kent')  # m-dash to hyphen
        mpp_url = name_elem.attrib['href']
        mpp_page = self.lxmlize(mpp_url)
        image = mpp_page.xpath('//img[@class="mppimg"]/@src')
        # Party is the last paragraph of the info block on the member page.
        party = mpp_page.xpath('//div[@class="mppinfoblock"]/p[last()]/text()')[0].strip()
        p = Person(primary_org='legislature', name=name, district=district, role='MPP', party=party)
        if image:
            p.image = image[0]
        p.add_source(COUNCIL_PAGE)
        p.add_source(mpp_url)
        email = block.xpath('.//div[@class="email"]')
        if email:
            p.add_contact('email', self.get_email(email[0]))
        phone = block.xpath('.//div[@class="phone"]//text()')
        if phone:
            p.add_contact('voice', phone[0], 'legislature')
        yield p
def scrape(self):
    """Yield Kawartha Lakes members; ward links are councillors, the rest the mayor."""
    page = self.lxmlize(COUNCIL_PAGE)
    for link in page.xpath('//p[@class="WSIndent"]/a'):
        label = link.text_content()
        ward = re.findall(r'(Ward [0-9]{1,2})', label)
        if ward:
            district = ward[0]
            name = label.replace(district, '').strip()
            role = 'Councillor'
        else:
            district = 'Kawartha Lakes'
            name = label.replace('Mayor', '').strip()
            role = 'Mayor'
        url = link.attrib['href']
        detail = self.lxmlize(url)
        person = Person(primary_org='legislature', name=name,
                        district=district, role=role)
        person.add_source(COUNCIL_PAGE)
        person.add_source(url)
        person.add_contact('email', self.get_email(detail))
        person.image = detail.xpath('//img[@class="image-right"]/@src')[0]
        yield person
def scrape(self):
    """Yield Abbotsford's mayor and councillors.

    Member headings come from the council page and are zipped positionally
    with rows of a phone/fax table on the contact page — the two listings
    are assumed to be in the same order.
    """
    councillor_seat_number = 1
    coun_page = self.lxmlize(COUNCIL_PAGE)
    contact_page = self.lxmlize(CONTACT_PAGE)
    councillors = coun_page.xpath('//div[@id="main-content"]//h3')
    # Skip the header row of the contact table.
    contact_data = contact_page.xpath('//p[contains(./strong/text(), "Mayor & Council")]/following-sibling::table[1]//tr')[1:]
    for councillor, contact in zip(councillors, contact_data):
        text = councillor.text_content()
        if text.startswith('Councill'):
            role = 'Councillor'
            district = 'Abbotsford (seat {})'.format(councillor_seat_number)
            councillor_seat_number += 1
        else:
            role = 'Mayor'
            district = 'Abbotsford'
        # Heading is "<Title> <Name>"; keep everything after the first space.
        name = text.split(' ', 1)[1]
        image = councillor.xpath('./img/@src')[0]
        phone = contact.xpath('./td[2]/text()')[0]
        fax = contact.xpath('./td[3]/text()')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(CONTACT_PAGE)
        p.image = image
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        yield p
def scrape(self):
    """Yield Richmond's mayor and councillors.

    All members share one contact email scraped from the contact page;
    councillors are numbered sequentially as district seats.
    """
    seat = 1
    contact_page = self.lxmlize(CONTACT_URL)
    shared_email = self.get_email(contact_page)
    listing = self.lxmlize(COUNCIL_PAGE)
    for url in listing.xpath('//a/@href[contains(., "members/")]'):
        detail = self.lxmlize(url)
        # Page heading is "<Role> <Name>".
        role, name = detail.xpath('//h1//text()')[0].split(' ', 1)
        portrait = detail.xpath('//img/@src')[0]
        if role == 'Mayor':
            district = 'Richmond'
        else:
            district = 'Richmond (seat {})'.format(seat)
            seat += 1
        person = Person(primary_org='legislature', name=name,
                        district=district, role=role)
        person.image = portrait
        person.add_source(COUNCIL_PAGE)
        person.add_source(CONTACT_URL)
        person.add_source(url)
        person.add_contact('email', shared_email)
        yield person
def scrape(self):
    """Yield Lambton's warden, deputy warden and councillors.

    Councillors share the county district, so each gets a sequential
    "(seat N)" suffix.
    """
    seat = 1
    page = self.lxmlize(COUNCIL_PAGE)
    # Member info occupies every other table row.
    rows = page.xpath('//div[@id="content"]//table//tr[position() mod 2 = 1]')
    assert len(rows), 'No councillors found'
    for row in rows:
        heading = row.xpath('.//strong/text()')[0]
        # Check "Deputy Warden" first — plain "Warden" is a substring of it.
        if 'Deputy Warden' in heading:
            role = 'Deputy Warden'
            name = heading.replace('Deputy Warden', '')
            district = 'Lambton'
        elif 'Warden' in heading:
            role = 'Warden'
            name = heading.replace('Warden', '')
            district = 'Lambton'
        else:
            role = 'Councillor'
            name = heading
            district = 'Lambton (seat {})'.format(seat)
            seat += 1
        person = Person(primary_org='legislature', name=name,
                        district=district, role=role)
        person.add_source(COUNCIL_PAGE)
        person.image = row.xpath('.//img/@src')[0]
        person.add_contact('email', self.get_email(row))
        yield person
def scrape(self):
    """Yield Woolwich's mayor and ward councillors.

    Each <strong> holds the member name; the surrounding paragraph/div text
    holds the district line followed by labelled phone numbers.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@id="printArea"]//strong')
    for councillor in councillors:
        info = councillor.xpath('./parent::p/text()')
        # Some entries are wrapped in a <div> instead of a <p>.
        if not info:
            info = councillor.xpath('./parent::div/text()')
        info = [x for x in info if x.strip()]
        # First non-empty line is the title; trim anything after "Ward N".
        district = re.sub(r'(?<=Ward \d).+', '', info.pop(0))
        if 'Mayor' in district:
            district = 'Woolwich'
            role = 'Mayor'
        else:
            district = district.replace('Councillor', '').strip()
            role = 'Councillor'
        p = Person(primary_org='legislature', name=councillor.text_content(), district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath('./img/@src')[0]
        for contact in info:
            # Split on the FIRST colon only: the number part may contain
            # another colon, which would make a bare split(':') unpack raise.
            note, num = contact.split(':', 1)
            num = num.strip().replace('(', '').replace(') ', '-').replace('extension ', 'x')
            p.add_contact(note, num, note)
        yield p
def scrape(self):
    """Yield Senneville's mayor and district councillors from the members table."""
    page = self.lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//div[@class="field-item even"]//tr')
    assert len(rows), 'No councillors found'
    for row in rows:
        seat = row.xpath('./td[1]//strong/text()')[0].replace('no. ', '')
        if 'Maire' in seat:
            seat, role = 'Senneville', 'Maire'
        else:
            role = 'Conseiller'
        member_name = row.xpath('./td[2]//p//text()')[0].title()
        contact_email = self.get_email(row)
        person = Person(primary_org='legislature', name=member_name,
                        district=seat, role=role)
        person.add_source(COUNCIL_PAGE)
        # Not every row carries a portrait.
        portraits = row.xpath('.//img/@src')
        if portraits:
            person.image = portraits[0]
        person.add_contact('email', contact_email)
        yield person
def scrape(self):
    """Yield MLAs from the members table; extra details come from member pages."""
    page = self.lxmlize(COUNCIL_PAGE)
    members = page.xpath('//table[1]//tr')
    assert len(members), 'No members found'
    for member in members:
        # Ignore spacer/empty rows.
        if not member.text_content().strip():
            continue
        name = member.xpath('./td[2]//a[1]//text()')[0]
        # The "MLA: ..." link carries the riding; fix "St " → "St. " and
        # re-join the two hyphen-separated halves with stripped whitespace.
        district_name = member.xpath(
            './td[2]//a[contains(.//text(), "MLA")]//text()')[0].split(
                ':')[1].replace('St ', 'St. ').split('-')
        district = district_name[0].strip() + '-' + district_name[1].strip(
        )
        url = member.xpath('./td[2]//a[1]/@href')[0]
        ext_infos = self.scrape_extended_info(url)
        p = Person(primary_org='legislature', name=name, district=district, role='MLA')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        if ext_infos:
            # member pages might return errors
            email, phone, photo_url = ext_infos
            p.image = photo_url
            if email:
                p.add_contact('email', email)
            if phone:
                p.add_contact('voice', phone, 'legislature')
        yield p
def scrape(self):
    """Yield MLAs; emails are de-obfuscated from an inline JavaScript array.

    The member detail page hides the mailto link in a script that assigns
    character codes into an array in reverse order; `char` maps each code
    back to its character (non-numeric entries pass through unchanged).
    """
    def char(code):
        # chr() for numeric codes; literal fragments are kept as-is.
        try:
            return chr(int(code))
        except ValueError:
            return code
    page = self.lxmlize(COUNCIL_PAGE)
    for row in page.xpath('//div[@id="content"]/table/tbody/tr'):
        if 'Vacant' not in row.xpath('./td//text()')[0]:
            full_name, party, district = row.xpath('./td//text()')[:3]
            # "Last, First" → "First Last".
            name = ' '.join(reversed(full_name.split(',')))
            p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=self.PARTIES[party])
            detail_url = row[0][0].attrib['href']
            detail = self.lxmlize(detail_url)
            image = detail.xpath('//img[@class="portrait"]/@src')[0]
            p.image = image
            try:
                p.add_contact('voice', detail.xpath('//dd[@class="numbers"]/text()')[0].split(': ')[1], 'legislature')
            except IndexError:
                pass
            script = detail.xpath('//dd/script/text()')
            if script:
                # Entries were written back-to-front; reverse, decode, then
                # pull the address out of the reconstructed ">email<" markup.
                codes = reversed(re.findall(r"]='(.+?)'", script[0]))
                content = ''.join(char(code) for code in codes)
                p.add_contact('email', re.search(r'>(.+)<', content).group(1))
            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)
            yield p
def scrape(self):
    """Yield the mayor and district councillors of Pointe-Claire."""
    page = self.lxmlize(COUNCIL_PAGE)
    sections = page.xpath('//section[contains(@id, "js-council-member")]')
    assert len(sections), 'No councillors found'
    for position, section in enumerate(sections):
        full_name = ' '.join(section.xpath('.//h2/text()'))
        district_nodes = section.xpath(
            './/span[contains(@class, "c-info-list_label")][contains(text(), "District ")]'
        )
        role = 'Conseiller'
        if district_nodes:
            district = district_nodes[0].text_content().split(' – ')[0]
        elif position == 0:
            # The first member without a district label is the mayor.
            role = 'Maire'
            district = 'Pointe-Claire'
        else:
            district = district_nodes  # empty result, kept as-is
        p = Person(primary_org='legislature', name=full_name, district=district, role=role)
        p.image = section.xpath('.//@src')[0]
        p.add_contact('email', self.get_email(section))
        p.add_contact('voice', self.get_phone(section, area_codes=[514]), 'legislature')
        p.add_source(COUNCIL_PAGE)
        yield p
def scrape(self):
    """Yield the mayor, regional councillors and ward councillors of Richmond Hill."""
    # Regional councillors have no ward; give them sequential pseudo-seats.
    regional_councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//center/center//a')
    for councillor in councillors:
        name = councillor.text_content().strip()
        url = councillor.attrib['href']
        page = self.lxmlize(url)
        header = page.xpath(
            '//div[@class="sectionheading"]')[0].text_content()
        if header == 'Mayor of Richmond Hill':
            district = 'Richmond Hill'
            role = 'Mayor'
        else:
            # Ward councillors' headings carry ", <ward> -"; regional
            # councillors' headings don't, so they get a seat number.
            district = re.findall(r',(.*)-', header)
            if district:
                district = district[0].strip()
            else:
                district = 'Richmond Hill (seat {})'.format(
                    regional_councillor_seat_number)
                regional_councillor_seat_number += 1
            role = 'Regional Councillor' if 'Regional' in header else 'Councillor'
        # Contact details live in the last cell of one of two table layouts.
        info = page.xpath(
            '//table[@cellpadding>0]/tbody/tr/td[last()]|//table[not(@cellpadding)]/tbody/tr/td[last()]'
        )
        info = info[0].text_content().replace(' - office:', ':')
        address = re.findall(
            r'(?<=Town of Richmond Hill)(.*(?=Telephone:)|(?=Telephone))',
            info)[0]
        # Re-insert the spaces lost when adjacent text nodes were concatenated.
        address = re.sub(r'([a-z])([A-Z])', r'\1 \2', address)
        # I expected to be able to do '(.*)(?=\sTelephone|Telephone|Fax)', but nope.
        phone = re.findall(
            r'(?<=Telephone:) ((.*) (?=Telephone)|(.*)(?=Telephone)|(.*)(?=Fax))',
            info)[0][0].replace('(', '').replace(') ', '-').replace(', ext. ', ' x')
        fax = re.findall(r'(?<=Fax:) (.*)(?=E-mail)', info)[0].replace(
            ' ', '').replace('(', '').replace(')', '-')
        email = self.get_email(page)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email)
        p.image = page.xpath(
            '//img[contains(@alt, "{}")]/@src'.format(name))[0]
        if 'Website' in info:
            p.add_link(re.findall(r'www\..*\.[a-z]+', info)[0])
        yield p
def scrape(self):
    """Yield the mayor and councillors of Dorval."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@id="large_content"]//td/p[2]')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        info = councillor.xpath('./strong/text()')
        # In case the name spans on 2 lines: merge the first two text nodes
        # back into one name and re-split the role/district line.
        if len(info) > 2 and 'Councillor' not in info[1]:
            role, district = info[2].split('-')
            info = [info[0] + info[1], role, district]
        name = info[0]
        # Skip seats whose text node is exactly 'Vacant'.
        if 'Vacant' not in info:
            # Only the mayor's entry lacks a third (district) line.
            if len(info) < 3:
                district = 'Dorval'
                role = 'Maire'
            else:
                district = info[2]
                role = 'Conseiller'
            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.image = councillor.xpath('./preceding-sibling::p/img/@src')[0]
            email = self.get_email(councillor)
            p.add_contact('email', email)
            yield p
def scrape(self):
    """Yield Winnipeg's ward councillors (via councillor_data) and mayor."""
    page = self.lxmlize(COUNCIL_PAGE, 'utf-8')
    councillors = page.xpath('//td[@width="105"]')
    assert len(councillors), 'No councillors found'
    for node in councillors:
        url = urljoin(COUNCIL_PAGE, node.xpath('.//a/@href')[0])
        ward = re.search('([A-Z].+) Ward', node.xpath('.//a//text()')[0]).group(1)
        ward = ward.replace(' – ', '—').replace(
            ' - ', '—')  # n-dash, m-dash, hyphen, m-dash
        ward = ward.replace('St. Norbert', 'St Norbert')  # to match ocd-division-ids
        name = ' '.join(node.xpath('.//span[@class="k80B"][1]/text()'))
        yield self.councillor_data(url, name, ward)
    # The mayor sits in a wider cell with a different layout.
    mayor_node = page.xpath('//td[@width="315"]')[0]
    mayor_name = mayor_node.xpath('./a//text()')[0][len('Mayor '):]
    mayor_photo_url = mayor_node.xpath('./img/@src')[0]
    m = Person(primary_org='legislature', name=mayor_name, district='Winnipeg', role='Mayor')
    m.add_source(COUNCIL_PAGE)
    # @see http://www.winnipeg.ca/interhom/mayor/MayorForm.asp?Recipient=CLK-MayorWebMail
    m.add_contact('email', '*****@*****.**')  # hardcoded
    m.image = mayor_photo_url
    yield m
def scrape(self):
    """Yield Calgary's mayor (first entry, blank ward label) and councillors."""
    page = self.lxmlize(COUNCIL_PAGE)
    wrappers = page.xpath('//div[contains(@class, "councillorwrapper")]')
    assert len(wrappers), 'No councillors found'
    for position, wrapper in enumerate(wrappers):
        member_name = wrapper.xpath('.//h4/text()')[0]
        ward = wrapper.xpath('.//h4/span/text()')[0].strip()
        if position == 0 and not ward:
            # The first entry with an empty ward label is the mayor.
            member = Person(primary_org='legislature', name=member_name, district='Calgary', role='Mayor')
            member.add_contact('email', '*****@*****.**')
        else:
            member = Person(primary_org='legislature', name=member_name, district=ward, role='Councillor')
        member.image = wrapper.xpath('.//@src')[0]
        member.add_source(COUNCIL_PAGE)
        yield member
def scrape(self):
    """Yield the mayor (via scrape_mayor) and the ward councillors."""
    # Multiple councillors can share a ward; number their seats in page order.
    seat_numbers = defaultdict(int)
    page = self.lxmlize(COUNCIL_PAGE)
    yield self.scrape_mayor()
    councillors = page.xpath('//div[@id="centre_content"]//tr')
    for councillor in councillors:
        # Skip the table's header row.
        if 'Position' in councillor.text_content():
            continue
        ward = councillor.xpath('./td')[0].text_content().replace('Councillor', '')
        seat_numbers[ward] += 1
        district = '{} (seat {})'.format(ward, seat_numbers[ward])
        name = councillor.xpath('./td')[1].text_content()
        url = councillor.xpath('./td/a')[0].attrib['href']
        p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        page = self.lxmlize(url)
        content = page.xpath('//div[@id="centre_content"]')[0]
        email = self.get_email(content)
        p.add_contact('email', email)
        p.add_contact('voice', self.get_phone(content, area_codes=[226, 519]), 'legislature')
        p.image = page.xpath('string(//div[@id="centre_content"]//img/@src)')  # can be empty
        # With more than two links present, the last one is taken as a
        # personal website — TODO confirm against the live detail pages.
        if len(page.xpath('//div[@id="centre_content"]//a')) > 2:
            p.add_link(page.xpath('//div[@id="centre_content"]//a')[-1].attrib['href'])
        yield p
def scrape(self):
    """Yield Calgary's councillors (via councillor_data) and the mayor."""
    page = self.lxmlize(COUNCIL_PAGE)
    # The first captioned panel is skipped — presumably not a ward
    # councillor; TODO confirm against the live page.
    nodes = page.xpath('//div[contains(@class,"cocis-has-caption")]')[1:]
    for node in nodes:
        url = urljoin(COUNCIL_PAGE, node.xpath('.//a[1]/@href')[0])
        name = node.xpath('.//a//text()')[0]
        # Drops the caption's final whitespace-separated token to form the
        # ward name — NOTE(review): confirm what that trailing token is.
        ward = ' '.join(node.xpath('.//strong//text()')[0].split()[:-1])
        yield self.councillor_data(url, name, ward)
    mayor_node = page.xpath(
        '//div[contains(@class, "cocis-image-panel")]')[0]
    photo_url = urljoin(COUNCIL_PAGE, mayor_node.xpath('.//img/@src')[0])
    name = mayor_node.xpath('.//a//text()')[0]
    mayor_page = self.lxmlize(MAYOR_PAGE)
    # Email behind mailhide
    # email = self.get_email(mayor_page)
    phone = self.get_phone(mayor_page, area_codes=[403])
    m = Person(primary_org='legislature', name=name, district='Calgary', role='Mayor')
    m.add_source(COUNCIL_PAGE)
    m.add_source(MAYOR_PAGE)
    m.add_contact('voice', phone, 'legislature')
    m.image = photo_url
    yield m
def scrape(self):
    """Yield the warden, deputy warden and councillors of Lambton County."""
    next_seat = 1
    page = self.lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//div[@id="content"]//table//tr[position() mod 2 = 1]')
    assert len(rows), 'No councillors found'
    for row in rows:
        label = row.xpath('.//strong/text()')[0]
        # Test "Deputy Warden" before "Warden": the former contains the latter.
        if 'Deputy Warden' in label:
            member = Person(primary_org='legislature',
                            name=label.replace('Deputy Warden', ''),
                            district='Lambton', role='Deputy Warden')
        elif 'Warden' in label:
            member = Person(primary_org='legislature',
                            name=label.replace('Warden', ''),
                            district='Lambton', role='Warden')
        else:
            member = Person(primary_org='legislature', name=label,
                            district='Lambton (seat {})'.format(next_seat),
                            role='Councillor')
            next_seat += 1
        member.add_source(COUNCIL_PAGE)
        member.image = row.xpath('.//img/@src')[0]
        member.add_contact('email', self.get_email(row))
        yield member
def scrape_mayor(self):
    """Return a Person for the mayor of Summerside, scraped from MAYOR_PAGE."""
    page = self.lxmlize(MAYOR_PAGE, 'iso-8859-1')
    name = page.xpath(
        '//div[@class="articletitle"]/h1')[0].text_content().replace(
        'Mayor', '')
    p = Person(primary_org='legislature', name=name, district='Summerside', role='Mayor')
    p.add_source(MAYOR_PAGE)
    # Photo src is a relative "../" path; strip the parent-directory markers.
    p.image = page.xpath(
        '//div[@class="articlebody-inside"]/p/img/@src')[0].replace(
        '..', '')
    info = page.xpath('//div[@class="articlebody-inside"]/p')
    # Contact details sit at fixed paragraph positions — NOTE(review): an
    # IndexError here means the page layout shifted; confirm positions then.
    phone = re.findall(r'to (.*)', info[1].text_content())[0]
    address = info[3].text_content().replace(
        'by mail: ', '') + ' ' + info[4].text_content()
    email = self.get_email(info[5])
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('address', address, 'legislature')
    p.add_contact('email', email)
    return p
def scrape(self):
    """Yield the mayor and councillors of Richmond; all share one contact email."""
    next_seat = 1
    contact_page = self.lxmlize(CONTACT_URL)
    shared_email = self.get_email(contact_page)
    listing = self.lxmlize(COUNCIL_PAGE)
    member_urls = listing.xpath('//a/@href[contains(., "members/")]')
    assert len(member_urls), 'No councillors found'
    for member_url in member_urls:
        detail = self.lxmlize(member_url)
        # The page heading reads "<Role> <Name>".
        role, member_name = detail.xpath('//h1//text()')[0].split(' ', 1)
        portrait = detail.xpath('//div[@id="content"]//img/@src')[0]
        if role == 'Mayor':
            district = 'Richmond'
        else:
            district = 'Richmond (seat {})'.format(next_seat)
            next_seat += 1
        member = Person(primary_org='legislature', name=member_name, district=district, role=role)
        member.image = portrait
        member.add_source(COUNCIL_PAGE)
        member.add_source(CONTACT_URL)
        member.add_source(member_url)
        member.add_contact('email', shared_email)  # same for all members
        yield member
def scrape(self):
    """Yield the mayor and councillors of St. John's."""
    # At-large councillors share the city-wide district; number their seats.
    councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    nodes = page.xpath('//div[@class="view-content"]/div')
    for node in nodes:
        fields = node.xpath('./div')
        role = fields[0].xpath('./div//text()')[0]
        # The link text repeats the role; keep only the trailing name
        # portion — NOTE(review): relies on the title-cased link text
        # containing the role string verbatim; confirm on the live page.
        name = fields[2].xpath('.//a//text()')[0].title().split(role)[-1].strip()
        if name == 'Vacant':
            continue
        if 'Ward' in role:
            district = role
            role = 'Councillor'
        else:
            if 'At Large' in role:
                role = 'Councillor at Large'
                district = "St. John's (seat {})".format(councillor_seat_number)
                councillor_seat_number += 1
            else:
                # Mayor / Deputy Mayor: city-wide district, role kept as-is.
                district = "St. John's"
        phone = fields[3].xpath('./div//text()')[0]
        email = self.get_email(fields[5])
        photo_url = node.xpath('.//img/@src')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email)
        p.image = photo_url
        yield p
def scrape(self):
    """Yield a Person for each MHA linked from the assembly members page."""
    # The site rejects the default client, so a custom User-Agent is used.
    self.user_agent = CUSTOM_USER_AGENT
    page = self.get(COUNCIL_PAGE)
    members = re.findall('/Members/YourMember/[^"]+', page.text)
    assert len(members), 'No members found'
    for member in members:
        detail_url = 'http://www.assembly.nl.ca%s' % member
        detail = self.lxmlize(detail_url, user_agent=CUSTOM_USER_AGENT)
        name = detail.xpath('//h1/text()')[0]
        # Normalize the district separator (space-padded nbsp/en-dash/hyphen)
        # to a bare m-dash, matching division IDs.
        district = re.sub(r' [\xa0–-] ', '—', detail.xpath('//h2/text()')[0])  # # n-dash, m-dash
        party = PARTIES[detail.xpath('//h3/text()')[0]]
        p = Person(primary_org='legislature', name=name, district=district, role='MHA', party=party)
        p.image = detail.xpath('//img[@class="img-responsive"]/@src')[0]
        contact = detail.xpath('//div[@class="col-md-12"]')[0]
        p.add_contact('email', self.get_email(contact))
        p.add_source(COUNCIL_PAGE)
        p.add_source(detail_url)
        # Each phone block is introduced by a bold heading whose text maps
        # to a contact note (HEADING_TYPE, e.g. legislature vs constituency).
        for heading, _type in HEADING_TYPE.items():
            node = detail.xpath('//b[.="%s"]/../..' % heading)
            if node:
                phone = self.get_phone(node[0], error=False)
                if phone:
                    p.add_contact('voice', phone, _type)
        yield p
def scrape(self):
    """Yield the mayor (via scrape_mayor) and the district councillors."""
    page = self.lxmlize(COUNCIL_PAGE, 'utf-8')
    yield self.scrape_mayor(page)
    for tr in page.xpath('//tbody/tr'):
        # Skip seats marked vacant in the name column.
        if tr.xpath('./td[2]//text()')[0] != 'Vacant':
            district = tr.xpath('./td[1]/text()')[0]
            # Rows labelled "Conseiller n<...>" belong to Greenfield Park.
            if 'Conseiller n' in district:
                district = 'Greenfield Park'
            detail_url = tr.xpath('./td[2]/a/@href')[0]
            detail_page = self.lxmlize(detail_url, 'utf-8')
            name = detail_page.xpath('//h1/text()')[0]
            # Prefer a photo whose alt text matches the name; otherwise fall
            # back to the right-floated ("droite") image.
            photo_node = detail_page.xpath('//img[contains(@alt, "{0}")]/@src'.format(name))
            if photo_node:
                photo_url = photo_node[0]
            else:
                photo_url = detail_page.xpath('//img[contains(@class, "droite")]/@src')[0]
            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)
            p.image = photo_url
            yield p
def scrape(self):
    """Yield the mayor and district councillors of Gatineau.

    The council page renders its member list client-side, so names,
    districts and profile URLs are scraped out of parallel inline-JS
    arrays rather than the DOM.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    # it's all javascript rendered on the client... wow.
    js = page.xpath('string(//div[@class="inner_container"]/div/script[2])')  # allow string()
    districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
    names = re.findall(r'arrayMembres\[a.+"(.+)"', js)
    urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)
    # first item in each array is the mayor
    p = Person(primary_org='legislature', name=names[0], district='Gatineau', role='Maire')
    p.add_source(COUNCIL_PAGE)
    p.add_source(MAYOR_CONTACT_PAGE)
    email = '*****@*****.**'  # hardcoded: the contact page only offers a form
    p.add_contact('email', email)
    yield p
    for raw_district, name, url in list(zip(districts, names, urls))[1:]:
        if name == 'Vacant':
            continue
        profile_url = COUNCIL_PAGE + '/' + url.split('/')[-1]
        profile_page = self.lxmlize(profile_url)
        photo_url = profile_page.xpath('//img/@src')[0]
        # Keep only the district number from the raw label.
        # Fix: use a raw string for the regex — '\d' in a plain string is an
        # invalid escape sequence (DeprecationWarning, SyntaxWarning on 3.12+).
        district = 'District ' + re.search(r'\d+', raw_district).group(0)
        p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(profile_url)
        p.image = photo_url
        p.add_contact('email', self.get_email(profile_page))
        yield p
def scrape(self):
    """Yield the mayor, regional councillors and ward councillors of Whitby."""
    # Regional councillors are at-large; number their seats in page order.
    regional_councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//a[@title="Mayor and Council::Meet Your Council"]/following-sibling::ul//@href')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        node = self.lxmlize(councillor).xpath('//div[@id="printArea"]')[0]
        name = node.xpath('.//h1/text()')[0]
        if 'Mayor' in name:
            role = 'Mayor'
            district = 'Whitby'
            name = name.replace('Mayor ', '')
        else:
            role = node.xpath('.//h2/text()')[0]
            if 'Regional Councillor' in role:
                district = 'Whitby (seat {})'.format(regional_councillor_seat_number)
                regional_councillor_seat_number += 1
            else:
                # Ward councillors' h2 reads "<role>, <ward> (<extra>)".
                role, district = role.split(', ')
                district = district.split(' (')[0]
        image = node.xpath('.//img/@src')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', self.get_phone(node), 'legislature')
        p.add_contact('email', self.get_email(node))
        p.image = image
        yield p
def scrape(self):
    """Yield the mayor (via scrape_mayor) and the ward councillors of Summerside."""
    page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    yield self.scrape_mayor()
    councillors = page.xpath('//div[@class="articlebody-inside"]//p[contains(text(),"-")]')
    for councillor in councillors:
        url = councillor.xpath('.//a')[0].attrib['href'].replace('../', '')
        page = self.lxmlize(url, 'iso-8859-1')
        name = page.xpath('//div[@class="articletitle"]/h1')[0].text_content().replace('Councillor', '').replace('Deputy Mayor', '')
        # The first paragraph names the ward; keep only its digits.
        district = 'Ward {}'.format(re.sub(r'\D+', '', page.xpath('//div[@class="articlebody-inside"]/p')[0].text_content()))
        p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        photo_url_rel = page.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0].replace('/..', '')
        p.image = urljoin(url, photo_url_rel)
        # Contacts form one "Label: value Label: value ..." paragraph; split
        # on ":" and read each value alongside the label that precedes it.
        contacts = page.xpath('//div[@class="articlebody-inside"]/p')[1].text_content().replace('Biography', '').replace('Committees', '').split(':')
        for i, contact in enumerate(contacts):
            if i == 0 or not contact:
                continue
            # The label is the capitalized word in the preceding chunk.
            contact_type = re.findall(r'([A-Z][a-z]+)', contacts[i - 1])[0]
            # Non-address values run into the next label; cut at the first
            # capital letter.
            if contact_type != 'Address':
                contact = re.split(r'[A-Z]', contact)[0]
            contact_type = CONTACT_DETAIL_TYPE_MAP[contact_type]
            p.add_contact(contact_type, contact, '' if contact_type == 'email' else 'legislature')
        yield p
def scrape_mayor(self, div):
    """Return a Person for the mayor, following the profile and contact pages."""
    name = div.xpath('.//a')[0].text_content()
    url = div.xpath('.//a/@href')[0]
    page = self.lxmlize(url)
    contact_url = page.xpath('//a[@title="Joindre le maire"]/@href')[0]
    contact_page = self.lxmlize(contact_url)
    p = Person(primary_org='legislature', name=name, district='Saint-Jean-sur-Richelieu', role='Maire')
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    p.add_source(contact_url)
    p.image = div.xpath('./preceding-sibling::td//img/@src')[-1]
    # The contact block is a flat run of <font> text nodes: address lines
    # first, phone and fax near the end — NOTE(review): positions are
    # CMS-dependent; confirm if this starts mis-parsing.
    contacts = contact_page.xpath(
        '//div[@id="ctl00_PlaceHolderMain_ctl01_ctl01__ControlWrapper_RichHtmlField"]//div/font/text()'
    )
    address = ' '.join(contacts[:4])
    phone = contacts[-3].split(':')[1].strip().replace(' ', '-')
    fax = contacts[-2].split(':')[1].strip().replace(' ', '-')
    p.add_contact('address', address, 'legislature')
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('fax', fax, 'legislature')
    # mayor's email is a form
    return p
def scrape(self):
    """Yield a Person for each sitting Saskatchewan MLA."""
    page = self.lxmlize(COUNCIL_PAGE)
    # Skip the table's header row.
    councillors = page.xpath('//table[@id="MLAs"]//tr')[1:]
    for councillor in councillors:
        if 'Vacant' not in councillor.xpath('./td')[0].text_content():
            # Drop the honorific before the first ". " (e.g. "Hon. ", "Mr. ").
            name = councillor.xpath('./td')[0].text_content().split('. ', 1)[1]
            party = councillor.xpath('./td')[1].text
            district = councillor.xpath('./td')[2].text_content()
            url = councillor.xpath('./td[1]/a/@href')[0]
            page = self.lxmlize(url)
            p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.image = page.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0]
            contact = page.xpath('//div[@id="mla-contact"]/div[2]')[0]
            website = contact.xpath('./div[3]/div[3]/div[2]/a')
            if website:
                p.add_link(website[0].text_content())
            # Constituency address is split across nested divs; rejoin the
            # relevant text nodes.
            p.add_contact('address', ' '.join(contact.xpath('.//div[@class="col-md-4"][2]/div//text()')[1:9]), 'constituency')
            phone_leg = contact.xpath('.//span[@id="MainContent_ContentBottom_Property6"]//text()')[0]
            phone_const = contact.xpath('.//div[@class="col-md-4"]/div[4]/span/span/text()')[0]
            p.add_contact('voice', phone_leg, 'legislature', area_code=306)
            p.add_contact('voice', phone_const, 'constituency', area_code=306)
            email = self.get_email(contact)
            p.add_contact('email', email)
            yield p
def scrape(self):
    """Yield the mayor and councillors of Kirkland."""
    page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    councillors = page.xpath('//div[@id="PageContent"]/table/tbody/tr/td')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        # Skip empty layout cells.
        if not councillor.text_content().strip():
            continue
        # The mayor occupies the table's first cell.
        if councillor == councillors[0]:
            district = 'Kirkland'
            role = 'Maire'
        else:
            district = councillor.xpath('.//h2')[0].text_content()
            district = re.search('- (.+)', district).group(1).strip()
            # Lowercase the compass words to match the division IDs.
            district = district.replace(' Ouest', ' ouest').replace(' Est', ' est')
            role = 'Conseiller'
        name = councillor.xpath('.//strong/text()')[0]
        # Strip the leading "T " label, hyphenate the spaces, and turn the
        # ", #" extension marker into " x".
        phone = councillor.xpath(
            './/div[contains(text(), "#")]/text()')[0].replace(
            'T ', '').replace(' ', '-').replace(',-#-', ' x')
        email = self.get_email(councillor)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email)
        p.image = councillor.xpath('.//img/@src')[0]
        yield p
def scrape(self):
    """Yield the mayor and ward councillors of Fredericton."""
    listing = self.lxmlize(COUNCIL_PAGE)
    rows = listing.xpath(
        '//div[contains(@class, "view-people")]//div[contains(@class, "views-row")]'
    )
    assert len(rows), 'No councillors found'
    for row in rows:
        member_name = row.xpath('.//div[@property="dc:title"]')[0].text_content()
        # Second-to-last subtitle paragraph reads "Mayor" or "<role>, <ward>".
        subtitle = row.xpath(
            './/div[contains(@class, "field-name-field-sub-title")]//p'
        )[-2].text_content().replace('\xa0', ' ')
        if subtitle == 'Mayor':
            role, district = 'Mayor', 'Fredericton'
        else:
            role, district = 'Councillor', subtitle.split(', ', 1)[1]
        detail_url = row.xpath('.//@href')[0]
        detail = self.lxmlize(detail_url)
        member = Person(primary_org='legislature', name=member_name, district=district, role=role)
        member.image = row.xpath('.//img[@typeof="foaf:Image"]/@src')[0]
        member.add_contact('email', self.get_email(detail))
        member.add_contact('voice', self.get_phone(detail, area_codes=[506]), 'legislature')
        member.add_source(COUNCIL_PAGE)
        member.add_source(detail_url)
        yield member
def scrape(self):
    """Yield the mayor and district councillors of Sainte-Anne-de-Bellevue."""
    page = self.lxmlize(COUNCIL_PAGE)
    blocks = page.xpath('//div[@class="block text"]')
    assert len(blocks), 'No councillors found'
    for block in blocks:
        member_name = block.xpath(
            './/div[@class="content-writable"]//strong/text()')[0]
        heading = block.xpath('.//h2/text()')[0]
        if 'Maire' in heading:
            role, district = 'Maire', 'Sainte-Anne-de-Bellevue'
        else:
            # The heading carries the district number.
            role = 'Conseiller'
            district = 'District {}'.format(re.search(r'\d+', heading).group())
        member = Person(primary_org='legislature', name=member_name, district=district, role=role)
        member.add_source(COUNCIL_PAGE)
        member.image = block.xpath('.//@src')[0]
        member.add_contact('email', self.get_email(block))
        yield member
def councillor_data(self, url, name, ward):
    """Build and RETURN a Person for one councillor's profile page.

    Bug fix: this helper previously ended with ``yield m``, which made it a
    generator. Its callers do ``yield self.councillor_data(...)``, so they
    would have yielded a generator object rather than the Person itself.
    Returning the Person directly matches how the callers consume it.
    """
    page = self.lxmlize(url)
    # sadly, email is a form on a separate page
    photo_url_rel = page.xpath(
        '//div[contains(@id, "contentcontainer")]//img/@src')[0]
    photo_url = urljoin(url, photo_url_rel)
    m = Person(primary_org='legislature', name=name, district=ward, role='Councillor')
    m.add_source(COUNCIL_PAGE)
    m.add_source(url)
    # The phone number may appear in either the main or the lower container.
    phone = self.get_phone(page.xpath('//div[@id="contentcontainer"]')[0],
                           area_codes=[306], error=False)
    if not phone:
        phone = self.get_phone(
            page.xpath('//div[@id="lowercontentcontainer"]')[0],
            area_codes=[306], error=False)
    if phone:
        m.add_contact('voice', phone, 'legislature')
    m.image = photo_url
    return m