def scrape(self):
    """Scrape Kirkland's council page, yielding the mayor then councillors."""
    page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    councillors = page.xpath('//div[@id="PageContent"]/table/tbody/tr/td')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        # Skip empty layout cells.
        if not councillor.text_content().strip():
            continue
        if councillor == councillors[0]:
            # The first populated cell is the mayor.
            district = 'Kirkland'
            role = 'Maire'
        else:
            district = councillor.xpath('.//h2')[0].text_content()
            # The district name follows a hyphen in the heading.
            district = re.search('- (.+)', district).group(1).strip()
            # Normalize cardinal-direction casing (matches division names).
            district = district.replace(' Ouest', ' ouest').replace(' Est', ' est')
            role = 'Conseiller'
        name = councillor.xpath('.//strong/text()')[0]
        # Phone text looks like "T 514 555-1234, # 100"; drop the "T "
        # prefix, hyphenate spaces, and turn ",-#-" into an " x" extension.
        phone = councillor.xpath(
            './/div[contains(text(), "#")]/text()')[0].replace(
            'T ', '').replace(' ', '-').replace(',-#-', ' x')
        email = self.get_email(councillor)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email)
        p.image = councillor.xpath('.//img/@src')[0]
        yield p
def scrape(self):
    """Yield Senneville's mayor and councillors from the roster table."""
    page = self.lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//div[@class="field-item even"]//tr')
    assert len(rows), 'No councillors found'
    for row in rows:
        seat = row.xpath('./td[1]//strong/text()')[0].replace('no. ', '')
        if 'Maire' in seat:
            role, district = 'Maire', 'Senneville'
        else:
            role, district = 'Conseiller', seat
        member_name = row.xpath('./td[2]//p//text()')[0].title()
        member_email = self.get_email(row)
        person = Person(primary_org='legislature', name=member_name,
                        district=district, role=role)
        person.add_source(COUNCIL_PAGE)
        # Not every row carries a portrait.
        portraits = row.xpath('.//img/@src')
        if portraits:
            person.image = portraits[0]
        person.add_contact('email', member_email)
        yield person
def scrape(self):
    """Scrape Newfoundland and Labrador MHAs from member detail pages."""
    # The site requires a non-default user agent.
    self.user_agent = CUSTOM_USER_AGENT
    page = self.get(COUNCIL_PAGE)
    members = re.findall('/Members/YourMember/[^"]+', page.text)
    assert len(members), 'No members found'
    for member in members:
        detail_url = 'http://www.assembly.nl.ca%s' % member
        detail = self.lxmlize(detail_url, user_agent=CUSTOM_USER_AGENT)
        name = detail.xpath('//h1/text()')[0]
        # Normalize the separator (nbsp, n-dash or hyphen) to an m-dash.
        district = re.sub(r' [\xa0–-] ', '—', detail.xpath('//h2/text()')[0])  # n-dash, m-dash
        party = PARTIES[detail.xpath('//h3/text()')[0]]
        p = Person(primary_org='legislature', name=name, district=district, role='MHA', party=party)
        p.image = detail.xpath('//img[@class="img-responsive"]/@src')[0]
        contact = detail.xpath('//div[@class="col-md-12"]')[0]
        p.add_contact('email', self.get_email(contact))
        p.add_source(COUNCIL_PAGE)
        p.add_source(detail_url)
        # Map each known phone heading to its contact note type.
        for heading, _type in HEADING_TYPE.items():
            node = detail.xpath('//b[.="%s"]/../..' % heading)
            if node:
                phone = self.get_phone(node[0], error=False)
                if phone:
                    p.add_contact('voice', phone, _type)
        yield p
def scrape(self):
    """Yield Richmond's mayor and councillors; all share one contact email."""
    seat = 1
    contact_page = self.lxmlize(CONTACT_URL)
    shared_email = self.get_email(contact_page)
    index_page = self.lxmlize(COUNCIL_PAGE)
    member_urls = index_page.xpath('//a/@href[contains(., "members/")]')
    assert len(member_urls), 'No councillors found'
    for member_url in member_urls:
        detail = self.lxmlize(member_url)
        role, name = detail.xpath('//h1//text()')[0].split(' ', 1)
        portrait = detail.xpath('//div[@id="content"]//img/@src')[0]
        if role == 'Mayor':
            district = 'Richmond'
        else:
            district = 'Richmond (seat {})'.format(seat)
            seat += 1
        person = Person(primary_org='legislature', name=name,
                        district=district, role=role)
        person.image = portrait
        person.add_source(COUNCIL_PAGE)
        person.add_source(CONTACT_URL)
        person.add_source(member_url)
        person.add_contact('email', shared_email)  # same for all
        yield person
def scrape(self):
    """Yield the mayor and the first two councillors of each ward.

    Fix: the ward-prefix pattern is now a raw string; the previous plain
    literal relied on invalid string escape sequences, which raise
    warnings (and eventually errors) on modern Python.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    mayor_info = page.xpath('//h2[contains(text(), "MAYOR")]//following-sibling::p')[0]
    yield self.scrape_mayor(mayor_info)
    wards = page.xpath('//h3')
    for ward in wards:
        district = re.sub(r'\AWARD \d+ - ', '', ward.text_content())
        councillors = ward.xpath('following-sibling::p')
        for councillor in councillors:
            name = councillor.xpath('./strong')[0].text_content()
            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            info = councillor.xpath('./text()')
            # First text line is the mailing address.
            address = info.pop(0)
            p.add_contact('address', address, 'legislature')
            # Remaining lines hold phone numbers, delimited by "»" and
            # non-breaking spaces.
            for line in info:
                stuff = re.split(r'(\xbb)|(\xa0)', line)
                tmp = [y for y in stuff if y and not re.match(r'\xa0', y)]
                self.get_tel_numbers(tmp, p)
            email = self.get_email(councillor)
            p.add_contact('email', email)
            yield p
            # Only the first two <p> siblings belong to this ward; later
            # paragraphs fall under the next heading.
            if councillor == councillors[1]:
                break
def scrape(self):
    """Yield St. John's mayor, at-large councillors and ward councillors."""
    at_large_seat = 1
    page = self.lxmlize(COUNCIL_PAGE)
    for node in page.xpath('//div[@class="view-content"]/div'):
        fields = node.xpath('./div')
        role = fields[0].xpath('./div//text()')[0]
        # The name cell sometimes repeats the role; strip it off.
        name = fields[2].xpath('.//a//text()')[0].title().split(role)[-1].strip()
        if name == 'Vacant':
            continue
        if 'Ward' in role:
            district, role = role, 'Councillor'
        elif 'At Large' in role:
            role = 'Councillor at Large'
            district = "St. John's (seat {})".format(at_large_seat)
            at_large_seat += 1
        else:
            district = "St. John's"
        phone = fields[3].xpath('./div//text()')[0]
        email = self.get_email(fields[5])
        portrait = node.xpath('.//img/@src')[0]
        person = Person(primary_org='legislature', name=name,
                        district=district, role=role)
        person.add_source(COUNCIL_PAGE)
        person.add_contact('voice', phone, 'legislature')
        person.add_contact('email', email)
        person.image = portrait
        yield person
def scrape(self):
    """Yield Saint-Jérôme's mayor and councillors from a table layout.

    Fix: assert that member rows were found — consistent with the other
    scrapers in this project — so a silent page-layout change fails
    loudly instead of yielding nothing.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    # Two-cell rows hold members; the last matching row is boilerplate.
    councillor_trs = [tr for tr in page.xpath('//table//tr[1]') if len(tr) == 2][:-1]
    assert len(councillor_trs), 'No councillors found'
    for councillor_tr in councillor_trs:
        desc = [text.strip() for text in councillor_tr.xpath('.//text()[normalize-space()]') if text.strip()]
        # The mayor's cell lacks a district line, leaving 3 text chunks.
        if len(desc) == 3:
            role = 'Maire'
            district = 'Saint-Jérôme'
        else:
            role = 'Conseiller'
            district = desc[0].replace('numéro ', '')
        # The last three chunks are always name, phone, email.
        name = desc[-3]
        phone = desc[-2]
        email = desc[-1]
        image = councillor_tr.xpath('.//img/@src')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = image
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Scrape Calgary's mayor and ward councillors."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath(
        '//div[contains(@class, "councillorwrapper")]')
    assert len(councillors), 'No councillors found'
    for index, councillor in enumerate(councillors):
        name = councillor.xpath('.//h4/text()')[0]
        district = councillor.xpath('.//h4/span/text()')[0].strip()
        role = 'Councillor'
        email = None
        # The first wrapper with an empty district span is the mayor.
        if not district and index == 0:
            district = 'Calgary'
            role = 'Mayor'
            email = '*****@*****.**'
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.image = councillor.xpath('.//@src')[0]
        # Only the mayor gets the hardcoded email above.
        if email:
            p.add_contact('email', email)
        p.add_source(COUNCIL_PAGE)
        yield p
def scrape(self):
    """Yield Milton's mayor and councillors.

    Fix: phone/fax lines are split on the first colon only, so a value
    that itself contains a colon no longer raises ValueError from the
    two-target unpacking.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//table[@id="Table1table"]/tbody/tr')
    assert len(councillors), 'No councillors found'
    for i, councillor in enumerate(councillors):
        role_district = councillor.xpath('./td[2]/p/text()')[0].strip()
        if 'Mayor' in role_district:
            # The mayor's first line carries both name and title.
            name = role_district.replace('Mayor and Regional Councillor', '')
            role = 'Mayor'
            district = 'Milton'
        else:
            name = councillor.xpath('./td[2]/p/text()')[1]
            role, district = re.split(r' (?=Ward)', role_district)
            # Normalize the site's role labels.
            if role == 'Town and Regional Councillor':
                role = 'Regional Councillor'
            elif role == 'Town Councillor':
                role = 'Councillor'
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath('./td[1]/p//img/@src')[0]
        numbers = councillor.xpath('./td[3]/p[2]/text()')
        for number in numbers:
            num_type, number = number.split(':', 1)
            number = number.replace(', ext ', ' x').strip()
            p.add_contact(num_type, number, num_type)
        yield p
def scrape(self):
    """Yield Northwest Territories MLAs from the assembly roster."""
    page = self.lxmlize(COUNCIL_PAGE)
    # Ridings whose on-page spelling differs from our division names.
    corrections = {
        'Mackenzie Delta': 'Mackenzie-Delta',
        'Tu Nedhe - Wiilideh': 'Tu Nedhe',
    }
    cells = page.xpath('//div[@class="views-field views-field-field-picture"]/parent::td')
    for cell in cells:
        member_name = cell[1].text_content().replace(' .', '. ')  # typo on page
        raw_riding = cell[2].text_content().strip()
        riding = corrections.get(raw_riding, raw_riding)
        detail_url = cell[0].xpath('.//a/@href')[0]
        detail = self.lxmlize(detail_url)
        portrait = detail.xpath('//div[@class="field-item even"]/img/@src')[0]
        member_email = self.get_email(detail)
        contact_text = ''.join(detail.xpath('//div[@property="content:encoded"]/p[1]//text()'))
        phone_match = re.search(r'P(hone)?: ([-0-9]+)', contact_text)
        member = Person(primary_org='legislature', name=member_name,
                        district=riding, role='MLA', image=portrait)
        member.add_source(COUNCIL_PAGE)
        member.add_source(detail_url)
        member.add_contact('email', member_email)
        if phone_match:
            member.add_contact('voice', phone_match.group(2), 'legislature')
        yield member
def scrape(self):
    """Yield Gatineau's mayor and district councillors.

    Fix: the district-number pattern is now a raw string; the previous
    plain literal relied on an invalid string escape, which raises a
    warning (and eventually an error) on modern Python.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    # The roster is rendered client-side; parse the parallel data arrays
    # out of the inline script.
    js = page.xpath('string(//div[@class="inner_container"]/div/script[2])')  # allow string()
    districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
    names = re.findall(r'arrayMembres\[a.+"(.+)"', js)
    urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)
    # first item in each list is the mayor
    p = Person(primary_org='legislature', name=names[0], district='Gatineau', role='Maire')
    p.add_source(COUNCIL_PAGE)
    p.add_source(MAYOR_CONTACT_PAGE)
    email = '*****@*****.**'  # hardcoded
    p.add_contact('email', email)
    yield p
    for raw_district, name, url in list(zip(districts, names, urls))[1:]:
        if name == 'Vacant':
            continue
        profile_url = COUNCIL_PAGE + '/' + url.split('/')[-1]
        profile_page = self.lxmlize(profile_url)
        photo_url = profile_page.xpath('//img/@src')[0]
        district = 'District ' + re.search(r'\d+', raw_district).group(0)
        email = self.get_email(profile_page)
        p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(profile_url)
        p.image = photo_url
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Yield MLAs from the first table on the member listing page."""
    page = self.lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//table[1]//tr')
    assert len(rows), 'No members found'
    for row in rows:
        if not row.text_content().strip():
            continue  # spacer row
        name = row.xpath('./td[2]//a[1]//text()')[0]
        # The district follows "MLA:" in the link text; normalize the
        # "St " abbreviation, then rejoin the hyphen-separated halves.
        raw = row.xpath('./td[2]//a[contains(.//text(), "MLA")]//text()')[0]
        parts = raw.split(':')[1].replace('St ', 'St. ').split('-')
        district = '{}-{}'.format(parts[0].strip(), parts[1].strip())
        url = row.xpath('./td[2]//a[1]/@href')[0]
        ext_infos = self.scrape_extended_info(url)
        member = Person(primary_org='legislature', name=name,
                        district=district, role='MLA')
        member.add_source(COUNCIL_PAGE)
        member.add_source(url)
        if ext_infos:  # member pages might return errors
            email, phone, photo_url = ext_infos
            member.image = photo_url
            if email:
                member.add_contact('email', email)
            if phone:
                member.add_contact('voice', phone, 'legislature')
        yield member
def scrape(self):
    """Scrape Dorval's mayor and councillors."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@id="large_content"]//td/p[2]')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        info = councillor.xpath('./strong/text()')
        # In case the name spans on 2 lines
        if len(info) > 2 and 'Councillor' not in info[1]:
            role, district = info[2].split('-')
            # Rebuild info as [full name, role, district].
            info = [info[0] + info[1], role, district]
        name = info[0]
        # Exact-match membership: skips entries whose <strong> text is
        # literally "Vacant".
        if 'Vacant' not in info:
            # The mayor's entry has no role/district lines.
            if len(info) < 3:
                district = 'Dorval'
                role = 'Maire'
            else:
                district = info[2]
                role = 'Conseiller'
            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            # The portrait sits in the preceding paragraph.
            p.image = councillor.xpath('./preceding-sibling::p/img/@src')[0]
            email = self.get_email(councillor)
            p.add_contact('email', email)
            yield p
def scrape(self):
    """Scrape Pointe-Claire's mayor and district councillors."""
    page = self.lxmlize(COUNCIL_PAGE)
    # The mayor is the second paragraph of the first table.
    mayor = page.xpath('.//div[@class="item-page clearfix"]//table[1]//p')[1]
    name = mayor.xpath('.//strong/text()')[0]
    p = Person(primary_org='legislature', name=name, district='Pointe-Claire', role='Maire')
    p.add_source(COUNCIL_PAGE)
    phone = re.findall(r'[0-9]{3}[ -][0-9]{3}-[0-9]{4}', mayor.text_content())[0].replace(' ', '-')
    p.add_contact('voice', phone, 'legislature')
    yield p
    # Councillors come in row pairs: an odd row holds the member cells,
    # and the row after it holds the matching district/contact cells
    # (index j pairs a member with its details).
    rows = page.xpath('//tr')
    for i, row in enumerate(rows):
        if i % 2 == 0:
            continue
        councillors = row.xpath('./td')
        for j, councillor in enumerate(councillors):
            name = councillor.text_content()
            # rows[i + 1].xpath('.//td//a[contains(@href, "maps")]/text()')[j]  # district number
            district = rows[i + 1].xpath('.//td/p[1]/text()')[j].replace(' / ', '/')
            p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.image = councillor.xpath('.//img/@src')[0]
            phone = re.findall(r'[0-9]{3}[ -][0-9]{3}-[0-9]{4}', rows[i + 1].xpath('.//td')[j].text_content())[0].replace(' ', '-')
            p.add_contact('voice', phone, 'legislature')
            yield p
def scrape(self):
    """Yield Westmount's mayor and councillors."""
    page = self.lxmlize(COUNCIL_PAGE)
    for member in page.xpath('//div[@class="member-container"]'):
        full_name = member.xpath('.//h3')[0].text_content()
        position = member.xpath('.//div[@class="member-position"]')[0].text_content()
        if 'Maire' in position:
            role, district = 'Maire', 'Westmount'
        else:
            role = 'Conseiller'
            district = member.xpath('.//div[@class="entry-content"]/text()')[0]
        person = Person(primary_org='legislature', name=full_name,
                        district=district, role=role)
        person.add_source(COUNCIL_PAGE)
        # The press-photo link doubles as the portrait URL.
        person.image = member.xpath('.//a[@title="Photo pour la presse"]/@href')[0]
        person.add_contact('email', self.get_email(member))
        yield person
def scrape(self):
    """Scrape Winnipeg's ward councillors, then the mayor."""
    page = self.lxmlize(COUNCIL_PAGE, 'utf-8')
    councillors = page.xpath('//td[@width="105"]')
    assert len(councillors), 'No councillors found'
    for node in councillors:
        url = urljoin(COUNCIL_PAGE, node.xpath('.//a/@href')[0])
        ward = re.search('([A-Z].+) Ward', node.xpath('.//a//text()')[0]).group(1)
        # Normalize dash variants to an m-dash.
        ward = ward.replace(' – ', '—').replace(
            ' - ', '—')  # n-dash, m-dash, hyphen, m-dash
        ward = ward.replace('St. Norbert', 'St Norbert')  # to match ocd-division-ids
        name = ' '.join(node.xpath('.//span[@class="k80B"][1]/text()'))
        yield self.councillor_data(url, name, ward)
    mayor_node = page.xpath('//td[@width="315"]')[0]
    # Drop the leading "Mayor " title from the link text.
    mayor_name = mayor_node.xpath('./a//text()')[0][len('Mayor '):]
    mayor_photo_url = mayor_node.xpath('./img/@src')[0]
    m = Person(primary_org='legislature', name=mayor_name, district='Winnipeg', role='Mayor')
    m.add_source(COUNCIL_PAGE)
    # @see http://www.winnipeg.ca/interhom/mayor/MayorForm.asp?Recipient=CLK-MayorWebMail
    m.add_contact('email', '*****@*****.**')  # hardcoded
    m.image = mayor_photo_url
    yield m
def scrape(self):
    """Yield Saint-Jérôme's mayor and district councillors."""
    page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
    members = page.xpath('//div[contains(@class," inner_member")]')
    assert len(members), 'No councillors found'
    for member in members:
        full_name = member.xpath('.//h2/text()')[0]
        seat = member.xpath(
            './/div[contains(@class,"district")]/text()')[0].replace('numéro ', '')
        if 'Maire' in seat:
            role, district = 'Maire', 'Saint-Jérôme'
        else:
            role, district = 'Conseiller', seat
        portrait = member.xpath(
            './/div[@class="portrait_single"]/img/@data-lazy-src')[0]
        phone = member.xpath('.//div[contains(@class,"phone")]/text()')[0]
        person = Person(primary_org='legislature', name=full_name,
                        district=district, role=role)
        person.add_source(COUNCIL_PAGE)
        person.image = portrait
        person.add_contact('voice', phone, 'legislature')
        person.add_contact('email', self.get_email(member))
        yield person
def scrape(self):
    """Yield Calgary's ward councillors, then the mayor."""
    page = self.lxmlize(COUNCIL_PAGE)
    # The first captioned panel is not a councillor; skip it.
    for node in page.xpath('//div[contains(@class,"cocis-has-caption")]')[1:]:
        url = urljoin(COUNCIL_PAGE, node.xpath('.//a[1]/@href')[0])
        councillor_name = node.xpath('.//a//text()')[0]
        # Drop the trailing word of the caption to get the ward name.
        ward = ' '.join(node.xpath('.//strong//text()')[0].split()[:-1])
        yield self.councillor_data(url, councillor_name, ward)
    mayor_node = page.xpath('//div[contains(@class, "cocis-image-panel")]')[0]
    photo_url = urljoin(COUNCIL_PAGE, mayor_node.xpath('.//img/@src')[0])
    mayor_name = mayor_node.xpath('.//a//text()')[0]
    mayor_page = self.lxmlize(MAYOR_PAGE)
    # Email behind mailhide
    # email = self.get_email(mayor_page)
    phone = self.get_phone(mayor_page, area_codes=[403])
    mayor = Person(primary_org='legislature', name=mayor_name,
                   district='Calgary', role='Mayor')
    mayor.add_source(COUNCIL_PAGE)
    mayor.add_source(MAYOR_PAGE)
    mayor.add_contact('voice', phone, 'legislature')
    mayor.image = photo_url
    yield mayor
def scrape(self):
    """Scrape the mayor (first image grid) and councillors (second grid)."""
    page = self.lxmlize(COUNCIL_PAGE)
    yield self.scrape_mayor(
        page.xpath('//div[@class="img_four"][1]/div[1]')[0])
    councillors = page.xpath('//div[@class="img_four"][2]/div')
    for councillor_elem in councillors:
        # Entry text has the shape "Name, Position District".
        name, position = councillor_elem.xpath('string(./p/strong)').split(
            ',')  # allow string()
        position = position.strip()
        position, district = position.split(' ', 1)
        district = post_number(district)
        # Join remaining text lines into a multi-line mailing address.
        addr = '\n'.join(
            addr_str.strip() for addr_str in councillor_elem.xpath('./p/text()')).strip()
        phone = councillor_elem.xpath(
            './/a[starts-with(@href, "tel:")]//text()')[0]
        image = councillor_elem.xpath('.//img[1]/@src')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=position, image=image)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('address', addr, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        yield p
def scrape(self):
    """Yield Trois-Rivières' mayor and district councillors.

    Fix: the district-prefix pattern is now a raw string; the previous
    plain literal relied on invalid string escape sequences, which warn
    (and will eventually error) on modern Python.
    """
    # mayor first, can't find email
    page = self.lxmlize(MAYOR_URL)
    photo_url = page.xpath('//img/@src[contains(., "maire")]')[0]
    name = page.xpath('//td[@class="contenu"]/text()[last()]')[0]
    p = Person(primary_org='legislature', name=name, district="Trois-Rivières", role="Maire", image=photo_url)
    p.add_source(MAYOR_URL)
    yield p
    resp = self.get(COUNCIL_PAGE)
    # page rendering through JS on the client
    page_re = re.compile(r'createItemNiv3.+"District (.+?)".+(index.+)\\"')
    for district, url_rel in page_re.findall(resp.text):
        # Strip leading French particles, except for districts whose
        # official names keep them.
        if district not in ('des Estacades', 'des Plateaux', 'des Terrasses', 'du Sanctuaire'):
            district = re.sub(r'\A(?:de(?: la)?|des|du) ', '', district)
        url = urljoin(COUNCIL_PAGE, url_rel)
        page = self.lxmlize(url)
        name_content = page.xpath('//h2//text()')
        # NOTE(review): if the heading is missing, `name` silently keeps
        # the previous iteration's value — confirm this fallback is
        # intended before changing it.
        if name_content:
            name = name_content[0]
        email = self.get_email(page)
        photo_url = page.xpath('//img/@src[contains(., "Conseiller")]')[0]
        p = Person(primary_org='legislature', name=name, district=district, role='Conseiller', image=photo_url)
        p.add_source(url)
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Scrape the mayor and district councillors from linked sub-pages."""
    page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT)
    mayor_url = page.xpath('//a[contains(text(), "Mayor")]/@href')[0]
    mayor = self.scrape_mayor(mayor_url)
    if mayor:
        yield mayor
    councillors_url = page.xpath('//a[contains(text(), "Councillors")]/@href')[0]
    cpage = self.lxmlize(councillors_url, user_agent=CUSTOM_USER_AGENT)
    # Rows containing an image hold members; the last one is boilerplate.
    councillors = cpage.xpath('//tr[td//img]')[:-1]
    assert len(councillors), 'No councillors found'
    for councillor_row in councillors:
        img_cell, info_cell = tuple(councillor_row)
        if info_cell.xpath('.//p//text()[contains(., "Vacant")]'):
            continue
        # Drop text nodes that are blank once non-breaking spaces are
        # treated as whitespace.
        cells = [x.strip() for x in info_cell.xpath('.//text()') if re.sub('\xa0', ' ', x).strip()]
        name = cells[0].replace('Councillor ', '')
        district = info_cell.xpath('.//p[contains(text(), "District")]//text()')[0]
        email = self.get_email(info_cell)
        phone = self.get_phone(info_cell, area_codes=[438, 514], error=False)
        img_url_rel = img_cell.xpath('.//img/@src')[0]
        img_url = urljoin(councillors_url, img_url_rel)
        p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(councillors_url)
        p.add_contact('email', email)
        if phone:
            p.add_contact('voice', phone, 'legislature')
        p.image = img_url
        yield p
def scrape(self):
    """Yield Mercier's mayor (first row) and district councillors."""
    page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT, encoding='windows-1252')
    rows = page.xpath('//table[@width="800"]/tr')
    assert len(rows), 'No councillors found'
    for position, row in enumerate(rows):
        # Every name is prefixed with an honorific; drop it.
        name = row.xpath('.//strong/text()')[0].replace(
            'Monsieur', '').replace('Madame', '').strip()
        if position == 0:
            role, district = 'Maire', 'Mercier'
        else:
            role = 'Conseiller'
            district = 'District {}'.format(
                re.search(r'(\d)', row.xpath('.//text()')[3]).group(1))
        person = Person(primary_org='legislature', name=name,
                        district=district, role=role)
        person.add_source(COUNCIL_PAGE)
        person.add_contact('email', self.get_email(row))
        yield person
def scrape(self):
    """Yield Burnaby's mayor and councillors.

    Fix: ``email`` and ``phone`` were only assigned inside the
    ``if contact_node:`` branch, so a profile page without a contact
    block raised NameError on the first member (or silently reused the
    previous member's values). They are now reset per member and only
    added when actually found.
    """
    councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    for person_url in page.xpath('//h4/a/@href'):
        page = self.lxmlize(person_url)
        # Page title reads "<Role> <Name>".
        role, name = page.xpath('//title//text()')[0].split(' ', 1)
        photo_url = page.xpath('//div[@id="content"]//img[@style]/@src')[0]
        email = None
        phone = None
        contact_node = page.xpath('//div[@id="column-right"]//div[contains(., "Contact")]')
        if contact_node:
            email = self.get_email(contact_node[0])
            phone = self.get_phone(contact_node[0], area_codes=[604, 778])
        if role == 'Mayor':
            district = 'Burnaby'
        else:
            district = 'Burnaby (seat {})'.format(councillor_seat_number)
            councillor_seat_number += 1
        p = Person(primary_org='legislature', name=name, district=district, role=role, image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_source(person_url)
        if email:
            p.add_contact('email', email)
        if phone:
            p.add_contact('voice', phone, 'legislature')
        yield p
def scrape(self):
    """Yield Lambton County's warden, deputy warden and councillors.

    Fix: names obtained by removing the "Warden"/"Deputy Warden" title
    are now whitespace-trimmed; previously they retained the separator
    space left behind by ``str.replace``.
    """
    councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath(
        '//div[@id="content"]//table//tr[position() mod 2 = 1]')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        text = councillor.xpath('.//strong/text()')[0]
        # Test "Deputy Warden" first: "Warden" is a substring of it.
        if 'Deputy Warden' in text:
            role = 'Deputy Warden'
            name = text.replace('Deputy Warden', '').strip()
            district = 'Lambton'
        elif 'Warden' in text:
            role = 'Warden'
            name = text.replace('Warden', '').strip()
            district = 'Lambton'
        else:
            role = 'Councillor'
            name = text
            district = 'Lambton (seat {})'.format(councillor_seat_number)
            councillor_seat_number += 1
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath('.//img/@src')[0]
        p.add_contact('email', self.get_email(councillor))
        yield p
def scrape(self):
    """Yield Saanich's mayor and councillors from their profile pages."""
    seat = 1
    index_page = self.lxmlize(COUNCIL_PAGE)
    profile_urls = index_page.xpath('//div[contains(@class, "entry")]')[0].xpath('.//@href')
    assert len(profile_urls), 'No councillors found'
    for url in profile_urls:
        if '@' in url:
            continue  # mailto link, not a profile page
        profile = self.lxmlize(url)
        main = profile.xpath('//main[@id="content"]')[0]
        name = main.xpath('.//h1//text()')[0]
        if 'Mayor' in main.text_content():
            name = name.replace('Mayor ', '')
            role, district = 'Mayor', 'Saanich'
        else:
            role = 'Councillor'
            district = 'Saanich (seat {})'.format(seat)
            seat += 1
        person = Person(primary_org='legislature', name=name,
                        district=district, role=role)
        person.image = profile.xpath('.//@src')[0]
        person.add_contact('voice', self.get_phone(profile, area_codes=[250]), 'legislature')
        person.add_contact('email', self.get_email(profile.xpath('//main[@id="content"]')[0]))
        person.add_source(COUNCIL_PAGE)
        person.add_source(url)
        yield person
def scrape(self):
    """Yield Whitby's mayor and councillors.

    Fix: the vacancy check now matches "Vacant" as a substring of any
    text chunk; the previous ``'Vacant' in text`` tested list membership
    and only skipped entries whose chunk was exactly "Vacant".
    """
    regional_councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    yield self.scrape_mayor(page)
    councillors = page.xpath('//h3[contains(text(), "Councillors")]/following-sibling::p')[:-1]
    assert len(councillors), 'No councillors found'
    for councillor_node in councillors:
        text = councillor_node.xpath('./strong/text()')
        if not text or any('Vacant' in chunk for chunk in text):
            continue
        name, role_district = text
        name = name.rstrip(',')
        if 'Regional Councillor' in role_district:
            role = role_district
            district = 'Whitby (seat {})'.format(regional_councillor_seat_number)
            regional_councillor_seat_number += 1
        else:
            # e.g. "Councillor, North Ward (something)".
            role, district = role_district.strip().split(', ')
            district = district.split(' (')[0]
        email = self.get_email(councillor_node)
        image = councillor_node.xpath('./img/@src')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role, image=image)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Yield Kawartha Lakes' mayor and ward councillors.

    Fix: assert that member links were found — consistent with the other
    scrapers in this project — so a silent page-layout change fails
    loudly instead of yielding nothing.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//p[@class="WSIndent"]/a')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        district = re.findall(r'(Ward [0-9]{1,2})', councillor.text_content())
        if district:
            district = district[0]
            name = councillor.text_content().replace(district, '').strip()
            role = 'Councillor'
        else:
            # No ward in the link text: this is the mayor.
            district = 'Kawartha Lakes'
            name = councillor.text_content().replace('Mayor', '').strip()
            role = 'Mayor'
        url = councillor.attrib['href']
        page = self.lxmlize(url)
        email = self.get_email(page)
        image = page.xpath('//img[@class="image-right"]/@src')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('email', email)
        p.image = image
        yield p
def scrape(self):
    """Scrape MLAs, decoding the JavaScript-obfuscated email addresses."""
    def char(code):
        # The obfuscation stores character codes; non-numeric chunks
        # pass through unchanged.
        try:
            return chr(int(code))
        except ValueError:
            return code
    page = self.lxmlize(COUNCIL_PAGE)
    for row in page.xpath('//div[@id="content"]/table/tbody/tr'):
        if 'Vacant' not in row.xpath('./td//text()')[0]:
            full_name, party, district = row.xpath('./td//text()')[:3]
            # Names are listed "Last, First"; flip to "First Last".
            name = ' '.join(reversed(full_name.split(',')))
            p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=self.PARTIES[party])
            detail_url = row[0][0].attrib['href']
            detail = self.lxmlize(detail_url)
            image = detail.xpath('//img[@class="portrait"]/@src')[0]
            p.image = image
            # Some members list no phone number; skip silently.
            try:
                p.add_contact('voice', detail.xpath('//dd[@class="numbers"]/text()')[0].split(': ')[1], 'legislature')
            except IndexError:
                pass
            # The email is assembled client-side: the script fills an
            # array with character codes in reverse order; reverse and
            # decode them, then pull the address out of the markup.
            script = detail.xpath('//dd/script/text()')
            if script:
                codes = reversed(re.findall(r"]='(.+?)'", script[0]))
                content = ''.join(char(code) for code in codes)
                p.add_contact('email', re.search(r'>(.+)<', content).group(1))
            p.add_source(COUNCIL_PAGE)
            p.add_source(detail_url)
            yield p
def scrape(self):
    """Yield Richmond's mayor and councillors; all share one contact email.

    Fix: assert that member links were found — the sibling Richmond
    scraper in this file already does this — so a layout change fails
    loudly instead of silently yielding nothing.
    """
    councillor_seat_number = 1
    contact_page = self.lxmlize(CONTACT_URL)
    # One shared contact address for the whole council.
    email = self.get_email(contact_page)
    page = self.lxmlize(COUNCIL_PAGE)
    urls = page.xpath('//a/@href[contains(., "members/")]')
    assert len(urls), 'No councillors found'
    for url in urls:
        page = self.lxmlize(url)
        # Heading reads "<Role> <Name>".
        role, name = page.xpath('//h1//text()')[0].split(' ', 1)
        photo_url = page.xpath('//img/@src')[0]
        if role == 'Mayor':
            district = 'Richmond'
        else:
            district = 'Richmond (seat {})'.format(councillor_seat_number)
            councillor_seat_number += 1
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.image = photo_url
        p.add_source(COUNCIL_PAGE)
        p.add_source(CONTACT_URL)
        p.add_source(url)
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Scrape the mayor (first cell) and councillors from a SharePoint table."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@class="article-content"]//td[@class="ms-rteTableOddCol-0"]')
    yield self.scrape_mayor(councillors[0])
    assert len(councillors), 'No councillors found'
    for councillor in councillors[1:]:
        # Cells without a link are placeholders.
        if not councillor.xpath('.//a'):
            continue
        texts = [text for text in councillor.xpath('.//text()') if clean_string(text)]
        name = texts[0]
        district = texts[1]
        url = councillor.xpath('.//a/@href')[0]
        page = self.lxmlize(url)
        p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        # The portrait sits in the preceding (even) table column.
        p.image = councillor.xpath('./preceding-sibling::td//img/@src')[-1]
        contacts = page.xpath('.//td[@class="ms-rteTableOddCol-0"]//text()')
        for contact in contacts:
            # Any chunk containing 4 consecutive digits is treated as a
            # phone number.
            if re.findall(r'[0-9]{4}', contact):
                phone = contact.strip().replace(' ', '-')
                p.add_contact('voice', phone, 'legislature')
        get_links(p, page.xpath('.//td[@class="ms-rteTableOddCol-0"]')[0])
        email = self.get_email(page)
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Yield Woolwich's mayor and ward councillors.

    Fix: contact lines are split on the first colon only, so a value
    that itself contains a colon no longer raises ValueError from the
    two-target unpacking.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@id="printArea"]//strong')
    for councillor in councillors:
        info = councillor.xpath('./parent::p/text()')
        if not info:
            info = councillor.xpath('./parent::div/text()')
        info = [x for x in info if x.strip()]
        # The first line holds the role/ward; trim anything after "Ward N".
        district = re.sub(r'(?<=Ward \d).+', '', info.pop(0))
        if 'Mayor' in district:
            district = 'Woolwich'
            role = 'Mayor'
        else:
            district = district.replace('Councillor', '').strip()
            role = 'Councillor'
        p = Person(primary_org='legislature', name=councillor.text_content(), district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath('./img/@src')[0]
        # Remaining lines are "<label>: <number>" contact entries.
        for contact in info:
            note, num = contact.split(':', 1)
            num = num.strip().replace('(', '').replace(') ', '-').replace('extension ', 'x')
            p.add_contact(note, num, note)
        yield p
def scrape(self):
    """Yield Québec City's mayor and district councillors.

    Fix: regex patterns are now raw strings; the previous plain literals
    relied on invalid string escape sequences, which warn (and will
    eventually error) on modern Python.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[contains(@class, "ligne")]')
    for councillor in councillors:
        # Names are listed "Last, First"; flip the order.
        name = ' '.join(councillor.xpath('.//h3')[0].text_content().strip().split(', ')[::-1])
        if 'vacant' in name:
            continue
        district = councillor.xpath('./preceding-sibling::h2/text()')[-1]
        if 'Mairie' in district:
            district = 'Québec'
            role = 'Maire'
        else:
            text = councillor.xpath('.//a[@target="_blank"]/text()')
            district = re.search(r'\ADistrict électoral (?:de|du|des) (.+) - ?\d+\Z', text[0].strip().replace('\xa0', ''), flags=re.U).group(1)
            role = 'Conseiller'
            # Align district names with ocd-division identifiers.
            if district == 'Monts':
                district = 'Les Monts'
            elif district == 'Plateau':
                district = 'Le Plateau'
            else:
                district = re.sub('–', '—', district)  # n-dash, m-dash
                district = re.sub(r'\Ala ', 'La ', district)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath('./p//img/@src')[0]
        phone = self.get_phone(councillor, area_codes=[418])
        p.add_contact('voice', phone, 'legislature')
        yield p
def scrape(self):
    """Yield New Brunswick MLAs from the member table."""
    page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
    rows = page.xpath('//table/tbody/tr')
    assert len(rows), 'No members found'
    # Ridings whose names on the page differ from our division names.
    # @see https://en.wikipedia.org/wiki/Charlotte-Campobello
    # @see https://en.wikipedia.org/wiki/Oromocto-Lincoln-Fredericton
    renamed = {
        'Saint Croix': 'Charlotte-Campobello',
        'Oromocto-Lincoln-Fredericton': 'Oromocto-Lincoln',
    }
    for row in rows:
        riding, table_name, email = (' '.join(td.text_content().split()) for td in row[1:])
        if 'Vacant' in table_name:
            continue
        district = renamed.get(riding.replace('\x97', '-'), None)
        if district is None:
            district = riding.replace('\x97', '-')
        # Cell reads "Name, Status (Party)".
        name_with_status, party_abbr = re.match(r'(.+) \((.+)\)', table_name).groups()
        name = name_with_status.split(',')[0]
        photo_page_url = row[2][0].attrib['href']
        photo_url = self.get_photo_url(photo_page_url)
        member = Person(primary_org='legislature', name=name, district=district,
                        role='MLA', party=get_party(party_abbr.strip()), image=photo_url)
        member.add_contact('email', email)
        member.add_source(photo_page_url)
        member.add_source(COUNCIL_PAGE)
        yield member
def scrape(self):
    """Yield Terrebonne's mayor and district councillors."""
    listing = self.lxmlize(COUNCIL_PAGE, 'utf-8')
    boxes = listing.xpath('//div[contains(@class, "member-box member-box--")]')
    assert len(boxes), 'No councillors found'
    for box in boxes:
        name = box.xpath('.//div[@class="fiche__name"]/text()')[0]
        # The phone line is prefixed with "T"; keep what follows it.
        phone = box.xpath('.//div[@class="fiche__social"]/span/text()')[0].split('T')[1]
        mailto = box.xpath('.//div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href')
        portrait = box.xpath('.//img')[0].attrib['src']
        detail = self.lxmlize(box.xpath('.//a[@class="member-box__calltoaction"]/@href')[0])
        category = detail.xpath('.//div[@class="fiche__category"]/text()')[0]
        if category == 'Maire':
            role, district = 'Maire', 'Terrebonne'
        else:
            role, district = 'Conseiller', 'District {}'.format(category)
        person = Person(primary_org='legislature', name=name, district=district,
                        role=role, image=portrait)
        person.add_source(COUNCIL_PAGE)
        person.add_contact('voice', phone, 'legislature')
        if mailto:
            person.add_contact('email', mailto[0].split('mailto:')[1])
        yield person
def scrape(self):
    """Scrape MLAs from the first table on the member listing page."""
    page = self.lxmlize(COUNCIL_PAGE)
    members = page.xpath('//table[1]//tr')
    assert len(members), 'No members found'
    for member in members:
        # Skip spacer rows.
        if not member.text_content().strip():
            continue
        name = member.xpath('./td[2]//a[1]//text()')[0]
        # The district follows "MLA:" in the link text; normalize the
        # "St " abbreviation, then rejoin the hyphen-separated halves.
        district_name = member.xpath(
            './td[2]//a[contains(.//text(), "MLA")]//text()')[0].split(
            ':')[1].replace('St ', 'St. ').split('-')
        district = district_name[0].strip() + '-' + district_name[1].strip()
        url = member.xpath('./td[2]//a[1]/@href')[0]
        ext_infos = self.scrape_extended_info(url)
        p = Person(primary_org='legislature', name=name, district=district, role='MLA')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        if ext_infos:  # member pages might return errors
            email, phone, photo_url = ext_infos
            p.image = photo_url
            if email:
                p.add_contact('email', email)
            if phone:
                p.add_contact('voice', phone, 'legislature')
        yield p
def scrape(self):
    """Scrape Québec City's mayor and district councillors."""
    page = self.lxmlize(COUNCIL_PAGE)
    sections = page.xpath('//div[contains(@class, "membres-conseil-municipal")]')
    for section in sections:
        councillors = section.xpath('./div')
        assert len(councillors), 'No councillors found'
        for councillor in councillors:
            # Names are listed "Last, First"; flip the order.
            name = ' '.join(reversed(councillor.xpath('./h3//text()')))
            if 'vacant' in name.lower():
                continue
            # The preceding <h2> names the section (mayor vs. districts).
            header = section.xpath('./preceding-sibling::h2/text()')[-1]
            if 'Mairie' in header:
                district = 'Québec'
                role = 'Maire'
            else:
                district = councillor.xpath('./p[@itemprop="jobTitle"]/a/text()')[0]
                district = re.search(r'\ADistrict (?:de(?: la)?|du|des) ([\w —–-]+)', district, flags=re.U).group(1)
                role = 'Conseiller'
                # Align district names with ocd-division identifiers.
                if district == 'Saules':
                    district = 'Les Saules'
                else:
                    district = re.sub(r'–', '—', district)  # n-dash, m-dash
            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.image = councillor.xpath('./figure//@src')[0]
            p.add_contact('voice', self.get_phone(councillor, area_codes=[418]), 'legislature')
            yield p
def scrape(self):
    """Yield Oshawa council members, numbering city and regional seats separately."""
    councillor_seat_number = 1
    regional_councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//table//td')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        # Name and role live in the first <p>, or in a <span> on some cells.
        if councillor.xpath('./p[1]/text()'):
            name, role = councillor.xpath('./p[1]/text()')
        else:
            name, role = councillor.xpath('./span[1]/text()')
        role = role.strip()
        if role == 'City Councillor':
            role = 'Councillor'
            district = 'Oshawa (seat {})'.format(councillor_seat_number)
            councillor_seat_number += 1
        elif role == 'Regional and City Councillor':
            role = 'Regional Councillor'
            district = 'Oshawa (seat {})'.format(regional_councillor_seat_number)
            regional_councillor_seat_number += 1
        else:
            # City-wide office (e.g. the mayor) has no seat number.
            district = 'Oshawa'
        photo_url = councillor.xpath('./p/img/@src')[0]
        phone = self.get_phone(councillor.xpath('./p[contains(.//text(), "Phone")]')[0], area_codes=[905])
        p = Person(primary_org='legislature', name=name, district=district, role=role, image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', self.get_email(councillor))
        yield p
def scrape(self):
    """Yield Milton council members from the contact table."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//table[@id="Table1table"]/tbody/tr')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        # Hoist the repeated xpath: [0] = role, [1] = name, [2] = district (if any).
        cell_texts = councillor.xpath('./td[2]/p/text()')
        name = cell_texts[1]
        role = cell_texts[0].strip()
        # Normalize the site's role labels to our canonical roles.
        if role == 'Mayor and Regional Councillor':
            role = 'Mayor'
        elif role == 'Local & Regional Councillor':
            role = 'Regional Councillor'
        elif role == 'Local Councillor':
            role = 'Councillor'
        if len(cell_texts) < 3:
            district = 'Milton'  # city-wide office
        else:
            district = cell_texts[2]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath('./td[1]/p//img/@src')[0]
        if councillor == councillors[0]:
            # Only the first row publishes a mailing address.
            address = ', '.join(councillor.xpath('./td[3]/p[1]/text()')).replace('Email:', '').strip()
            p.add_contact('address', address, 'legislature')
        for number in councillor.xpath('./td[3]/p[2]/text()'):
            num_type, number = number.split(':')
            number = number.replace(', ext ', ' x').strip()
            p.add_contact(num_type, number, num_type)
        yield p
def scrape(self):
    """Yield Pointe-Claire's mayor and district councillors."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//section[contains(@id, "js-council-member")]')
    assert len(councillors), 'No councillors found'
    for index, councillor in enumerate(councillors):
        name = ' '.join(councillor.xpath('.//h2/text()'))
        district_nodes = councillor.xpath(
            './/span[contains(@class, "c-info-list_label")][contains(text(), "District ")]'
        )
        role = 'Conseiller'
        if not district_nodes and index == 0:
            # The first member without a district label is the mayor.
            district = 'Pointe-Claire'
            role = 'Maire'
        elif district_nodes:
            district = district_nodes[0].text_content().split(' – ')[0]
        else:
            # Preserves the original fall-through: an empty node list is
            # passed straight to Person when no label is present.
            district = district_nodes
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.image = councillor.xpath('.//@src')[0]
        p.add_contact('email', self.get_email(councillor))
        p.add_contact('voice', self.get_phone(councillor, area_codes=[514]), 'legislature')
        p.add_source(COUNCIL_PAGE)
        yield p
def scrape(self):
    """Yield the mayor (via scrape_mayor) and each ward councillor.

    Contact details are read from labelled paragraphs in the page sidebar.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    yield self.scrape_mayor()
    # The last landing-block link is not a councillor, so drop it.
    councillors = page.xpath('//h2[@class="landing-block-title"]/a')[:-1]
    for councillor in councillors:
        url = councillor.attrib['href']
        page = self.lxmlize(url)
        district = page.xpath('//div[@id="main-content"]/h1/text()')[0]
        name = page.xpath('//div[@id="main-content"]/h2/text()')[0]
        p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        contacts = page.xpath('//aside[@class="page-sidebar"]/div[1]/p')
        for contact in contacts[:-1]:
            contact_type = contact.xpath('./strong/text()')[0]
            if 'Contact' in contact_type:
                continue
            value = contact.xpath('./a/text()')[0]
            if 'Fax' in contact_type:
                p.add_contact('fax', value, 'legislature')
            elif 'Phone' in contact_type:
                # Bug fix: record phone numbers under the canonical 'voice'
                # contact type with a 'legislature' note, matching the other
                # scrapers, instead of reusing the raw page label (e.g.
                # "Phone:") as both the type and the note.
                p.add_contact('voice', value, 'legislature')
        yield p
def scrape(self):
    """Yield each MLA from the member table, following links for email and photo."""
    member_page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
    table = member_page.xpath('//table')[0]
    rows = table.xpath('.//tr')[1:]  # skip the header row
    assert len(rows), 'No members found'
    for row in rows:
        namecell, constitcell, partycell = row.xpath('.//td')
        full_name = namecell.text_content().strip()
        if full_name.lower() == 'vacant':
            continue
        # Names are "Last, First"; drop honorifics and rebuild as "First Last".
        last, first = full_name.split(',')
        name = first.replace('Hon.', '').strip() + ' ' + last.title().strip()
        # Collapse internal whitespace in the constituency name.
        district = ' '.join(constitcell.text_content().split())
        party = get_party(partycell.text)
        url = namecell.xpath('.//a')[0].get('href')
        page = self.lxmlize(url)
        email = self.get_email(page)
        p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('email', email)
        image = page.xpath('//img[@class="page_graphic"]/@src')
        if image:
            p.image = image[0]
        yield p
def scrape(self):
    """Yield each MPP from the address listing, following individual member pages."""
    page = self.lxmlize(COUNCIL_PAGE)
    for block in page.xpath('//div[@class="addressblock"]'):
        name_elem = block.xpath('.//a[@class="mpp"]')[0]
        name = ' '.join(name_elem.text.split())
        # Ridings use a doubled hyphen for an em dash on the listing page.
        riding = block.xpath('.//div[@class="riding"]//text()')[0].strip().replace('--', '\u2014')
        district = riding.replace('Chatham—Kent', 'Chatham-Kent')  # m-dash to hyphen
        mpp_url = name_elem.attrib['href']
        mpp_page = self.lxmlize(mpp_url)
        image = mpp_page.xpath('//img[@class="mppimg"]/@src')
        party = mpp_page.xpath('//div[@class="mppinfoblock"]/p[last()]/text()')[0].strip()
        p = Person(primary_org='legislature', name=name, district=district, role='MPP', party=party)
        if image:
            p.image = image[0]
        p.add_source(COUNCIL_PAGE)
        p.add_source(mpp_url)
        email = block.xpath('.//div[@class="email"]')
        if email:
            p.add_contact('email', self.get_email(email[0]))
        phone = block.xpath('.//div[@class="phone"]//text()')
        if phone:
            p.add_contact('voice', phone[0], 'legislature')
        yield p
def scrape(self):
    """Yield the mayor (helper) and ward councillors, numbering seats per ward."""
    seat_numbers = defaultdict(int)
    page = self.lxmlize(COUNCIL_PAGE)
    yield self.scrape_mayor()
    councillors = page.xpath('//div[@id="centre_content"]//tr')
    for councillor in councillors:
        # Skip the header row.
        if 'Position' in councillor.text_content():
            continue
        cells = councillor.xpath('./td')
        ward = cells[0].text_content().replace('Councillor', '')
        seat_numbers[ward] += 1
        district = '{} (seat {})'.format(ward, seat_numbers[ward])
        name = cells[1].text_content()
        url = councillor.xpath('./td/a')[0].attrib['href']
        p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        detail_page = self.lxmlize(url)
        content = detail_page.xpath('//div[@id="centre_content"]')[0]
        p.add_contact('email', self.get_email(content))
        p.add_contact('voice', self.get_phone(content, area_codes=[226, 519]), 'legislature')
        p.image = detail_page.xpath('string(//div[@id="centre_content"]//img/@src)')  # can be empty
        links = detail_page.xpath('//div[@id="centre_content"]//a')
        if len(links) > 2:
            p.add_link(links[-1].attrib['href'])
        yield p
def scrape(self):
    """Yield the mayor (helper) and Whitby council members from the listing page."""
    regional_councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    yield self.scrape_mayor(page)
    # Every <p> after the "Councillors" heading, except the trailing one.
    councillor_nodes = page.xpath('//h3[contains(text(), "Councillors")]/following-sibling::p')[:-1]
    for councillor_node in councillor_nodes:
        text = ' '.join(councillor_node.xpath('./strong/text()'))
        if not text or 'Vacant' in text:
            continue
        # The bolded text reads "Name, Role, District".
        name, role_district = text.split(', ', 1)
        if 'Regional Councillor' in role_district:
            role = role_district
            district = 'Whitby (seat {})'.format(regional_councillor_seat_number)
            regional_councillor_seat_number += 1
        else:
            role, district = role_district.strip().split(', ')
            district = district.split(' (')[0]
        email = self.get_email(councillor_node)
        image = councillor_node.xpath('./img/@src')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role, image=image)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Yield Abbotsford's mayor and councillors, pairing bios with contact rows."""
    councillor_seat_number = 1
    coun_page = self.lxmlize(COUNCIL_PAGE)
    contact_page = self.lxmlize(CONTACT_PAGE)
    councillors = coun_page.xpath('//div[@id="main-content"]//h3')
    contact_data = contact_page.xpath('//p[contains(./strong/text(), "Mayor & Council")]/following-sibling::table[1]//tr')[1:]
    # Bio headings and contact-table rows appear in the same order, so zip them.
    for councillor, contact in zip(councillors, contact_data):
        text = councillor.text_content()
        if text.startswith('Councill'):
            role = 'Councillor'
            district = 'Abbotsford (seat {})'.format(councillor_seat_number)
            councillor_seat_number += 1
        else:
            role = 'Mayor'
            district = 'Abbotsford'
        # Drop the leading "Councillor"/"Mayor" title from the heading text.
        name = text.split(' ', 1)[1]
        image = councillor.xpath('./img/@src')[0]
        phone = contact.xpath('./td[2]/text()')[0]
        fax = contact.xpath('./td[3]/text()')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(CONTACT_PAGE)
        p.image = image
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        yield p
def scrape(self):
    """Yield the mayor (helper) and ward councillors; parse contacts from labelled text."""
    page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    yield self.scrape_mayor()
    councillors = page.xpath('//div[@class="articlebody-inside"]//p[contains(text(),"-")]')
    for councillor in councillors:
        url = councillor.xpath('.//a')[0].attrib['href'].replace('../', '')
        page = self.lxmlize(url, 'iso-8859-1')
        name = page.xpath('//div[@class="articletitle"]/h1')[0].text_content().replace('Councillor', '').replace('Deputy Mayor', '')
        # Keep only the digits of the ward from the first body paragraph.
        district = 'Ward {}'.format(re.sub(r'\D+', '', page.xpath('//div[@class="articlebody-inside"]/p')[0].text_content()))
        p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        photo_url_rel = page.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0].replace('/..', '')
        p.image = urljoin(url, photo_url_rel)
        # The contact paragraph reads "Label: value Label: value ..."; splitting
        # on ':' means each value ends with the text of the NEXT label, and each
        # label lives at the tail of the PREVIOUS chunk.
        contacts = page.xpath('//div[@class="articlebody-inside"]/p')[1].text_content().replace('Biography', '').replace('Committees', '').split(':')
        for label_chunk, value in zip(contacts, contacts[1:]):
            if not value:
                continue
            # First capitalized word in the preceding chunk is the label.
            label = re.findall(r'([A-Z][a-z]+)', label_chunk)[0]
            if label != 'Address':
                # Trim the trailing text that belongs to the next label.
                value = re.split(r'[A-Z]', value)[0]
            contact_type = CONTACT_DETAIL_TYPE_MAP[label]
            p.add_contact(contact_type, value, '' if contact_type == 'email' else 'legislature')
        yield p
def scrape(self):
    """Yield Whitby's mayor and councillors from individual member pages."""
    regional_councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//a[@title="Mayor and Council::Meet Your Council"]/following-sibling::ul//@href')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        node = self.lxmlize(councillor).xpath('//div[@id="printArea"]')[0]
        name = node.xpath('.//h1/text()')[0]
        if 'Mayor' in name:
            role = 'Mayor'
            district = 'Whitby'
            name = name.replace('Mayor ', '')
        else:
            role = node.xpath('.//h2/text()')[0]
            if 'Regional Councillor' in role:
                district = 'Whitby (seat {})'.format(regional_councillor_seat_number)
                regional_councillor_seat_number += 1
            else:
                # Heading reads "Role, District (extra)"; keep only the district name.
                role, district = role.split(', ')
                district = district.split(' (')[0]
        image = node.xpath('.//img/@src')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', self.get_phone(node), 'legislature')
        p.add_contact('email', self.get_email(node))
        p.image = image
        yield p
def scrape(self):
    """Yield Fredericton's mayor and ward councillors."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[contains(@class, "view-people")]//div[contains(@class, "views-row")]')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        name = councillor.xpath('.//div[@property="dc:title"]')[0].text_content()
        # The second-to-last subtitle paragraph holds "Role, Ward"; normalize
        # non-breaking spaces first.
        role_and_district = councillor.xpath('.//div[contains(@class, "field-name-field-sub-title")]//p')[-2].text_content().replace('\xa0', ' ')
        if role_and_district == 'Mayor':
            district = 'Fredericton'
            role = 'Mayor'
        else:
            district = role_and_district.split(', ', 1)[1]
            role = 'Councillor'
        url = councillor.xpath('.//@href')[0]
        detail_page = self.lxmlize(url)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.image = councillor.xpath('.//img[@typeof="foaf:Image"]/@src')[0]
        p.add_contact('email', self.get_email(detail_page))
        p.add_contact('voice', self.get_phone(detail_page, area_codes=[506]), 'legislature')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        yield p
def scrape(self):
    """Yield Oshawa council members; each cell lists district, role, and name."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//table//td[*]')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        district, role, name = councillor.xpath('./p[1]/text()')
        role = role.strip()
        if district == 'City of Oshawa':
            district = 'Oshawa'
        # Normalize the site's role labels to our canonical roles.
        if role == 'City Councillor':
            role = 'Councillor'
        elif role == 'Regional & City Councillor':
            role = 'Regional Councillor'
        photo_url = councillor.xpath('./p/img/@src')[0]
        phone = self.get_phone(councillor.xpath('./p[contains(.//text(), "Phone")]')[0], area_codes=[905])
        p = Person(primary_org='legislature', name=name, district=district, role=role, image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', self.get_email(councillor))
        yield p
def councillor_data(self, url, name, ward):
    """Yield a Person for one councillor's detail page.

    Email is not scraped: it sits behind a contact form on a separate page.
    """
    page = self.lxmlize(url)
    photo_url_rel = page.xpath('//div[contains(@id, "contentcontainer")]//img/@src')[0]
    photo_url = urljoin(url, photo_url_rel)
    m = Person(primary_org='legislature', name=name, district=ward, role='Councillor')
    m.add_source(COUNCIL_PAGE)
    m.add_source(url)
    # Try the main content block first, then the lower one, for a phone number.
    for container_id in ('contentcontainer', 'lowercontentcontainer'):
        phone = self.get_phone(page.xpath('//div[@id="%s"]' % container_id)[0], area_codes=[306], error=False)
        if phone:
            m.add_contact('voice', phone, 'legislature')
            break
    m.image = photo_url
    yield m
def scrape(self):
    """Yield the mayor (helper) and councillors; number Greenfield Park seats."""
    page = self.lxmlize(COUNCIL_PAGE, 'utf-8')
    yield self.scrape_mayor(page)
    trs = page.xpath('//tbody/tr')
    assert len(trs), 'No councillors found'
    seat_number = 1
    for tr in trs:
        # Guard clause instead of wrapping the whole body in an if.
        if tr.xpath('./td[2]//text()')[0] == 'Vacant':
            continue
        district = tr.xpath('./td[1]/text()')[0]
        if 'Greenfield Park' in district or 'Conseiller n' in district:
            district = 'Greenfield Park (siège {})'.format(seat_number)
            seat_number += 1
        detail_url = tr.xpath('./td[2]/a/@href')[0]
        detail_page = self.lxmlize(detail_url, 'utf-8')
        name = detail_page.xpath('//h1/text()')[0]
        # Prefer the photo whose alt text matches the name; fall back to the
        # right-floated image.
        photo_node = detail_page.xpath('//img[contains(@alt, "{0}")]/@src'.format(name))
        if photo_node:
            photo_url = photo_node[0]
        else:
            photo_url = detail_page.xpath('//img[contains(@class, "droite")]/@src')[0]
        p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(detail_url)
        p.image = photo_url
        p.add_contact('email', self.get_email(detail_page))
        yield p
def scrape(self):
    """Yield Sainte-Anne-de-Bellevue's mayor and district councillors."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@class="block text"]')
    assert len(councillors), 'No councillors found'
    # The original enumerate index was unused; iterate directly.
    for councillor in councillors:
        name = councillor.xpath('.//div[@class="content-writable"]//strong/text()')[0]
        district = councillor.xpath('.//h2/text()')[0]
        if 'Maire' in district:
            district = 'Sainte-Anne-de-Bellevue'
            role = 'Maire'
        else:
            # Keep only the district number, e.g. "District 3".
            district = 'District {}'.format(re.search(r'\d+', district)[0])
            role = 'Conseiller'
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath('.//@src')[0]
        p.add_contact('email', self.get_email(councillor))
        yield p
def scrape(self):
    """Yield Lambton County's warden, deputy warden, and councillors."""
    councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    # Member info sits in every other table row.
    councillors = page.xpath('//div[@id="content"]//table//tr[position() mod 2 = 1]')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        text = councillor.xpath('.//strong/text()')[0]
        # Test "Deputy Warden" before "Warden": the former contains the latter.
        if 'Deputy Warden' in text:
            role = 'Deputy Warden'
            name = text.replace('Deputy Warden', '')
            district = 'Lambton'
        elif 'Warden' in text:
            role = 'Warden'
            name = text.replace('Warden', '')
            district = 'Lambton'
        else:
            role = 'Councillor'
            name = text
            district = 'Lambton (seat {})'.format(councillor_seat_number)
            councillor_seat_number += 1
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath('.//img/@src')[0]
        p.add_contact('email', self.get_email(councillor))
        yield p