def scrape(self):
    """Scrape Whitby's mayor and councillors from the council page."""
    regional_seat = 1
    page = self.lxmlize(COUNCIL_PAGE)
    yield self.scrape_mayor(page)

    # Every <p> following the "Councillors" heading, except the trailing one.
    nodes = page.xpath('//h3[contains(text(), "Councillors")]/following-sibling::p')[:-1]
    for node in nodes:
        label = ' '.join(node.xpath('./strong/text()'))
        if not label or 'Vacant' in label:
            continue
        name, role_district = label.split(', ', 1)
        if 'Regional Councillor' in role_district:
            # Regional councillors share the municipality-wide district,
            # so assign synthetic seat numbers.
            role = role_district
            district = 'Whitby (seat {})'.format(regional_seat)
            regional_seat += 1
        else:
            role, district = role_district.strip().split(', ')
            district = district.split(' (')[0]
        email = self.get_email(node)
        image = node.xpath('./img/@src')[0]

        p = Person(primary_org='legislature', name=name, district=district, role=role, image=image)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Scrape MPPs from the address-block listing and their detail pages."""
    page = self.lxmlize(COUNCIL_PAGE)
    for block in page.xpath('//div[@class="addressblock"]'):
        link = block.xpath('.//a[@class="mpp"]')[0]
        name = ' '.join(link.text.split())
        riding = block.xpath('.//div[@class="riding"]//text()')[0].strip().replace('--', '\u2014')
        district = riding.replace('Chatham—Kent', 'Chatham-Kent')  # m-dash to hyphen

        mpp_url = link.attrib['href']
        detail = self.lxmlize(mpp_url)
        portrait = detail.xpath('//img[@class="mppimg"]/@src')
        party = detail.xpath('//div[@class="mppinfoblock"]/p[last()]/text()')[0].strip()

        p = Person(primary_org='legislature', name=name, district=district, role='MPP', party=party)
        if portrait:
            p.image = portrait[0]
        p.add_source(COUNCIL_PAGE)
        p.add_source(mpp_url)

        email_node = block.xpath('.//div[@class="email"]')
        if email_node:
            p.add_contact('email', self.get_email(email_node[0]))
        phone_texts = block.xpath('.//div[@class="phone"]//text()')
        if phone_texts:
            p.add_contact('voice', phone_texts[0], 'legislature')
        yield p
def scrape(self):
    """Scrape MHAs from the assembly member directory."""
    self.user_agent = CUSTOM_USER_AGENT
    listing = self.get(COUNCIL_PAGE)
    member_paths = re.findall('/Members/YourMember/[^"]+', listing.text)
    assert len(member_paths), 'No members found'
    for path in member_paths:
        detail_url = 'http://www.assembly.nl.ca%s' % path
        detail = self.lxmlize(detail_url, user_agent=CUSTOM_USER_AGENT)
        name = detail.xpath('//h1/text()')[0]
        # Normalize n-dash / m-dash / hyphen separators to a single m-dash.
        district = re.sub(r' [\xa0–-] ', '—', detail.xpath('//h2/text()')[0])
        party = PARTIES[detail.xpath('//h3/text()')[0]]

        p = Person(primary_org='legislature', name=name, district=district, role='MHA', party=party)
        p.image = detail.xpath('//img[@class="img-responsive"]/@src')[0]
        contact = detail.xpath('//div[@class="col-md-12"]')[0]
        p.add_contact('email', self.get_email(contact))
        p.add_source(COUNCIL_PAGE)
        p.add_source(detail_url)

        # One phone number per known section heading.
        for heading, contact_type in HEADING_TYPE.items():
            section = detail.xpath('//b[.="%s"]/../..' % heading)
            if section:
                phone = self.get_phone(section[0], error=False)
                if phone:
                    p.add_contact('voice', phone, contact_type)
        yield p
def scrape(self):
    """Scrape the mayor and the councillors; yields Person objects.

    Fix: assert that councillors were found *before* indexing the list,
    so an empty scrape fails with the assertion message instead of an
    IndexError on ``councillors[0]``.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@class="article-content"]//td[@class="ms-rteTableOddCol-0"]')
    assert len(councillors), 'No councillors found'
    # The first cell is the mayor; the rest are councillors.
    yield self.scrape_mayor(councillors[0])
    for councillor in councillors[1:]:
        if not councillor.xpath('.//a'):
            continue  # cell without a detail link (e.g. placeholder)
        texts = [text for text in councillor.xpath('.//text()') if clean_string(text)]
        name = texts[0]
        district = texts[1]
        url = councillor.xpath('.//a/@href')[0]
        page = self.lxmlize(url)
        p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        # Portrait lives in the preceding table cell.
        p.image = councillor.xpath('./preceding-sibling::td//img/@src')[-1]
        contacts = page.xpath('.//td[@class="ms-rteTableOddCol-0"]//text()')
        for contact in contacts:
            # Any text with a 4-digit run is treated as a phone number.
            if re.findall(r'[0-9]{4}', contact):
                phone = contact.strip().replace(' ', '-')
                p.add_contact('voice', phone, 'legislature')
        get_links(p, page.xpath('.//td[@class="ms-rteTableOddCol-0"]')[0])
        email = self.get_email(page)
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Scrape MLAs from the first table on the council page."""
    page = self.lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//table[1]//tr')
    assert len(rows), 'No members found'
    for row in rows:
        if not row.text_content().strip():
            continue  # spacer row
        name = row.xpath('./td[2]//a[1]//text()')[0]
        # District text looks like "MLA: <part>-<part>"; fix "St " → "St. ".
        pieces = row.xpath('./td[2]//a[contains(.//text(), "MLA")]//text()')[0].split(':')[1].replace('St ', 'St. ').split('-')
        district = pieces[0].strip() + '-' + pieces[1].strip()
        url = row.xpath('./td[2]//a[1]/@href')[0]
        extra = self.scrape_extended_info(url)

        p = Person(primary_org='legislature', name=name, district=district, role='MLA')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        if extra:  # member pages might return errors
            email, phone, photo_url = extra
            p.image = photo_url
            if email:
                p.add_contact('email', email)
            if phone:
                p.add_contact('voice', phone, 'legislature')
        yield p
def scrape(self):
    """Scrape Dollard-Des Ormeaux's mayor and councillors.

    All members share one general phone and fax number taken from the
    page header.
    """
    page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    general = page.xpath('//p[@class="large_title"]/following-sibling::p/text()')
    general_phone = general[0]
    general_fax = general[1]
    for node in page.xpath('//tr/td/p/b'):
        text = node.text_content()
        if '@' in text or 'NEWSLETTER' in text:
            continue  # email addresses / newsletter blurbs, not members
        if 'Mayor' in text:
            name = text.replace('Mayor', '')
            district = 'Dollard-Des Ormeaux'
            role = 'Maire'
        else:
            # Entries embed a district digit; the name follows the digit.
            name = re.split(r'[0-9]', text)[1]
            district = 'District ' + re.findall(r'[0-9]', text)[0]
            role = 'Conseiller'
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = node.xpath('./parent::p/parent::td/parent::tr/preceding-sibling::tr//img/@src')[0]
        p.add_contact('email', self.get_email(node, './parent::p/following-sibling::p'))
        p.add_contact('voice', general_phone, 'legislature')
        p.add_contact('fax', general_fax, 'legislature')
        yield p
def scrape(self): page = self.lxmlize(COUNCIL_PAGE) councillors = page.xpath('//div[@id="c2087"]//a') for councillor in councillors: name = councillor.text_content() url = councillor.attrib['href'] page = self.lxmlize(url) if 'Maire' in page.xpath('//h2/text()')[0]: district = 'Sherbrooke' role = 'Maire' else: district = page.xpath('//div[@class="csc-default"]//a[@target="_blank"]/text()')[0].replace('district', '').replace('Domaine Howard', 'Domaine-Howard').strip() role = 'Conseiller' if district in ('de Brompton', 'de Lennoxville'): district = district.replace('de ', '') p = Person(primary_org='legislature', name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.add_source(url) p.image = page.xpath('//div[@class="csc-textpic-image csc-textpic-last"]//img/@src')[0] parts = page.xpath('//li[contains(text(), "phone")]/text()')[0].split(':') note = parts[0] phone = parts[1] p.add_contact(note, phone, note) email = self.get_email(page) if email: p.add_contact('email', email) if district == 'Brompton': p._related[0].extras['boundary_url'] = '/boundaries/sherbrooke-boroughs/brompton/' elif district == 'Lennoxville': p._related[0].extras['boundary_url'] = '/boundaries/sherbrooke-boroughs/lennoxville/' yield p
def scrape(self):
    """Scrape Richmond's mayor and councillors.

    A single shared email address is pulled from the contact page.
    """
    seat = 1
    contact_page = self.lxmlize(CONTACT_URL)
    email = self.get_email(contact_page)
    page = self.lxmlize(COUNCIL_PAGE)
    for url in page.xpath('//a/@href[contains(., "members/")]'):
        member_page = self.lxmlize(url)
        role, name = member_page.xpath('//h1//text()')[0].split(' ', 1)
        photo_url = member_page.xpath('//img/@src')[0]
        if role == 'Mayor':
            district = 'Richmond'
        else:
            district = 'Richmond (seat {})'.format(seat)
            seat += 1
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.image = photo_url
        p.add_source(COUNCIL_PAGE)
        p.add_source(CONTACT_URL)
        p.add_source(url)
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Scrape the mayor's page and the councillors listing."""
    page = self.lxmlize(COUNCIL_PAGE)
    mayor_url = page.xpath('//a[contains(text(), "Mayor")]/@href')[0]
    mayor = self.scrape_mayor(mayor_url)
    if mayor:
        yield mayor

    councillors_url = page.xpath('//a[contains(text(), "Councillors")]/@href')[0]
    listing = self.lxmlize(councillors_url)
    # Rows pairing a photo cell with an info cell; the last row is skipped.
    for row in listing.xpath("//tr[td//img]")[:-1]:
        img_cell, info_cell = tuple(row)
        if info_cell.xpath('.//p//text()[contains(., "Vacant")]'):
            continue
        name = info_cell.xpath('.//p//text()[contains(., "Councillor")]')[0].replace("Councillor ", "")
        district = info_cell.xpath('.//p[contains(text(), "District")]//text()')[0]
        email = self.get_email(info_cell)
        phone = self.get_phone(info_cell, area_codes=[438, 514])
        photo = urljoin(councillors_url, img_cell.xpath(".//img/@src")[0])

        p = Person(primary_org="legislature", name=name, district=district, role="Conseiller")
        p.add_source(COUNCIL_PAGE)
        p.add_source(councillors_url)
        p.add_contact("email", email)
        p.add_contact("voice", phone, "legislature")
        p.image = photo
        yield p
def scrape(self):
    """Scrape the mayor and councillors, tolerating missing phone numbers."""
    page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT)
    mayor_url = page.xpath('//a[contains(text(), "Mayor")]/@href')[0]
    mayor = self.scrape_mayor(mayor_url)
    if mayor:
        yield mayor

    councillors_url = page.xpath('//a[contains(text(), "Councillors")]/@href')[0]
    listing = self.lxmlize(councillors_url, user_agent=CUSTOM_USER_AGENT)
    rows = listing.xpath('//tr[td//img]')[:-1]
    assert len(rows), 'No councillors found'
    for row in rows:
        img_cell, info_cell = tuple(row)
        if info_cell.xpath('.//p//text()[contains(., "Vacant")]'):
            continue
        # Keep non-blank text chunks (nbsp-only chunks are dropped too).
        texts = [x.strip() for x in info_cell.xpath('.//text()') if re.sub('\xa0', ' ', x).strip()]
        name = texts[0].replace('Councillor ', '')
        district = info_cell.xpath('.//p[contains(text(), "District")]//text()')[0]
        email = self.get_email(info_cell)
        phone = self.get_phone(info_cell, area_codes=[438, 514], error=False)
        img_url = urljoin(councillors_url, img_cell.xpath('.//img/@src')[0])

        p = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(councillors_url)
        p.add_contact('email', email)
        if phone:
            p.add_contact('voice', phone, 'legislature')
        p.image = img_url
        yield p
def scrape(self): councillor_seat_number = 1 page = self.lxmlize(COUNCIL_PAGE) councillors = page.xpath('//div[contains(@class, "entry")]')[0].xpath('.//@href') assert len(councillors), 'No councillors found' for url in councillors: if '@' in url: continue page = self.lxmlize(url) main = page.xpath('//main[@id="content"]')[0] name = main.xpath('.//h1//text()')[0] if 'Mayor' in main.text_content(): name = name.replace('Mayor ', '') role = 'Mayor' district = 'Saanich' else: role = 'Councillor' district = 'Saanich (seat {})'.format(councillor_seat_number) councillor_seat_number += 1 p = Person(primary_org='legislature', name=name, district=district, role=role) p.image = page.xpath('.//@src')[0] p.add_contact('voice', self.get_phone(page, area_codes=[250]), 'legislature') p.add_contact('email', self.get_email(page.xpath('//main[@id="content"]')[0])) p.add_source(COUNCIL_PAGE) p.add_source(url) yield p
def scrape(self):
    """Scrape MLAs; emails are obfuscated in JS as reversed char codes."""
    def decode(code):
        # Codes are usually ints; literal characters pass through unchanged.
        try:
            return chr(int(code))
        except ValueError:
            return code

    page = self.lxmlize(COUNCIL_PAGE)
    for row in page.xpath('//div[@id="content"]/table/tbody/tr'):
        if 'Vacant' in row.xpath('./td//text()')[0]:
            continue
        full_name, party, district = row.xpath('./td//text()')[:3]
        name = ' '.join(reversed(full_name.split(',')))
        p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=self.PARTIES[party])
        detail_url = row[0][0].attrib['href']
        detail = self.lxmlize(detail_url)
        p.image = detail.xpath('//img[@class="portrait"]/@src')[0]
        try:
            p.add_contact('voice', detail.xpath('//dd[@class="numbers"]/text()')[0].split(': ')[1], 'legislature')
        except IndexError:
            pass  # no phone listed
        script = detail.xpath('//dd/script/text()')
        if script:
            # The script assigns characters in reverse; undo and rebuild.
            codes = reversed(re.findall(r"]='(.+?)'", script[0]))
            content = ''.join(decode(code) for code in codes)
            p.add_contact('email', re.search(r'>(.+)<', content).group(1))
        p.add_source(COUNCIL_PAGE)
        p.add_source(detail_url)
        yield p
def scrape_mayor(self, url):
    """Return Markham's mayor as a Person.

    Fix: the original ended with ``yield p``, which turned this helper
    into a generator; sibling ``scrape_mayor`` helpers in this codebase
    ``return`` the Person, and callers use the result directly (e.g.
    ``yield self.scrape_mayor(...)``), so a generator object would have
    been yielded instead of the mayor.
    """
    page = self.lxmlize(url)
    # The portrait's alt text reads "Mayor <name>".
    name = page.xpath('//img/@alt[contains(., "Mayor")]')[0].split(' ', 1)[1]
    p = Person(primary_org='legislature', name=name, district='Markham', role='Mayor')
    p.add_source(url)
    return p
def scrape(self):
    """Scrape MLAs, applying known riding-name corrections."""
    page = self.lxmlize(COUNCIL_PAGE)
    corrections = {
        'Mackenzie Delta': 'Mackenzie-Delta',
        'Tu Nedhe - Wiilideh': 'Tu Nedhe',
    }
    cells = page.xpath('//div[@class="views-field views-field-field-picture"]/parent::td')
    for cell in cells:
        name = cell[1].text_content().replace(' .', '. ')  # typo on page
        raw_riding = cell[2].text_content().strip()
        riding = corrections.get(raw_riding, raw_riding)
        detail_url = cell[0].xpath('.//a/@href')[0]
        detail_page = self.lxmlize(detail_url)
        photo_url = detail_page.xpath('//div[@class="field-item even"]/img/@src')[0]
        email = self.get_email(detail_page)
        contact_text = ''.join(detail_page.xpath('//div[@property="content:encoded"]/p[1]//text()'))
        phone_match = re.search(r'P(hone)?: ([-0-9]+)', contact_text)

        p = Person(primary_org='legislature', name=name, district=riding, role='MLA', image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_source(detail_url)
        p.add_contact('email', email)
        if phone_match:
            p.add_contact('voice', phone_match.group(2), 'legislature')
        yield p
def scrape(self):
    """Scrape Burnaby's mayor and councillors.

    Fix: ``email`` and ``phone`` are reset on every iteration.  They
    were previously assigned only inside ``if contact_node``, so a page
    without a contact block raised NameError on the first iteration or
    silently reused the previous member's contact details on later ones;
    the email add is now guarded like the phone add.
    """
    councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    for person_url in page.xpath('//h4/a/@href'):
        page = self.lxmlize(person_url)
        # Page title reads "<role> <name>".
        role, name = page.xpath('//title//text()')[0].split(' ', 1)
        photo_url = page.xpath('//div[@id="content"]//img[@style]/@src')[0]
        email = None
        phone = None
        contact_node = page.xpath('//div[@id="column-right"]//div[contains(., "Contact")]')
        if contact_node:
            email = self.get_email(contact_node[0])
            phone = self.get_phone(contact_node[0], area_codes=[604, 778])
        if role == 'Mayor':
            district = 'Burnaby'
        else:
            district = 'Burnaby (seat {})'.format(councillor_seat_number)
            councillor_seat_number += 1
        p = Person(primary_org='legislature', name=name, district=district, role=role, image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_source(person_url)
        if email:
            p.add_contact('email', email)
        if phone:
            p.add_contact('voice', phone, 'legislature')
        yield p
def scrape(self):
    """Scrape MLAs from the published CSV roster plus detail pages."""
    csv_text = self.get(self.get_csv_url()).text
    reader = csv.DictReader(csv_text.split('\n'))
    for mla in reader:
        name = '{} {} {}'.format(mla['MLA First Name'], mla['MLA Middle Names'], mla['MLA Last Name'])
        if not name.strip():
            continue  # blank row
        party = get_party(mla['Caucus'])
        # Anything after a comma is a status suffix; keep the bare name.
        name_without_status = name.split(',')[0]
        detail_url = (
            'http://www.assembly.ab.ca/net/index.aspx?'
            'p=mla_contact&rnumber={0}&leg=29'.format(
                mla['Riding Number']
            )
        )
        detail_page = self.lxmlize(detail_url)
        photo_url = detail_page.xpath('//img[@class="MemPhoto"]/@src')[0]
        p = Person(
            primary_org='legislature',
            name=name_without_status,
            district=mla['Riding Name'],
            role='MLA',
            party=party,
            image=photo_url,
        )
        p.add_source(COUNCIL_PAGE)
        p.add_source(detail_url)
        if mla['Email']:
            p.add_contact('email', mla['Email'])
        if mla['Phone Number']:
            p.add_contact('voice', mla['Phone Number'], 'legislature')
        yield p
def scrape(self): page = self.lxmlize(COUNCIL_PAGE) yield self.scrape_mayor() councillors = page.xpath('//h2[@class="landing-block-title"]/a')[:-1] for councillor in councillors: url = councillor.attrib['href'] page = self.lxmlize(url) district = page.xpath('//div[@id="main-content"]/h1/text()')[0] name = page.xpath('//div[@id="main-content"]/h2/text()')[0] p = Person(primary_org='legislature', name=name, district=district, role='Councillor') p.add_source(COUNCIL_PAGE) p.add_source(url) contacts = page.xpath('//aside[@class="page-sidebar"]/div[1]/p') for contact in contacts[:-1]: contact_type = contact.xpath('./strong/text()')[0] if 'Contact' in contact_type: continue value = contact.xpath('./a/text()')[0] if 'Fax' in contact_type: p.add_contact('fax', value, 'legislature') if 'Phone' in contact_type: p.add_contact(contact_type, value, contact_type) yield p
def scrape_mayor(self):
    """Return Edmonton's mayor as a Person."""
    page = self.lxmlize(MAYOR_PAGE)
    heading = page.xpath('//h1[contains(text(), "Mayor")]/text()')[0]
    p = Person(primary_org='legislature', name=heading.replace('Mayor', '').strip(),
               district='Edmonton', role='Mayor')
    p.add_source(MAYOR_PAGE)
    p.add_contact('address', ' '.join(page.xpath('//address/p/text()')), 'legislature')
    return p
def scrape(self):
    """Scrape mayors/wardens from the province-wide municipalities PDF.

    Fixes over the original:
    - the PDF bytes are written with ``'wb'`` (text mode raises
      TypeError on Python 3);
    - ``pdftotext`` output (bytes) is decoded before str regexes run;
    - ``fax`` is initialized, so the contact add no longer raises
      NameError when a record has no fax line;
    - the email-matching loop stops at the first match instead of
      popping from the list it is iterating;
    - the temp file is removed via ``os.unlink`` in a ``finally``
      instead of shelling out to ``rm``.
    """
    response = urlopen(COUNCIL_PAGE).read()
    with open('/tmp/ns.pdf', 'wb') as pdf:
        pdf.write(response)
    try:
        data = subprocess.check_output(['pdftotext', '/tmp/ns.pdf', '-']).decode('utf-8')
        emails = re.findall(r'(?<=E-mail: ).+', data)
        # Each record starts with "Mayor " or "Warden ".
        records = re.split(r'Mayor |Warden ', data)[1:]
        for record in records:
            lines = record.splitlines(True)
            name = lines.pop(0).strip()
            if name == "Jim Smith":
                continue
            district = lines.pop(0).strip()
            # District names may wrap onto a second, digit-free line.
            if not re.findall(r'[0-9]', lines[0]):
                district = district + ' ' + lines.pop(0).strip()
            org = Organization(name=district + ' Municipal Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)
            yield org
            p = Person(primary_org='legislature', name=name, district=district)
            p.add_source(COUNCIL_PAGE)
            membership = p.add_membership(org, role='Mayor', district=district)
            address = lines.pop(0).strip() + ', ' + lines.pop(0).strip()
            # Addresses run until the "Phone" line (up to two extra lines).
            if 'Phone' not in lines[0]:
                address = address + ', ' + lines.pop(0).strip()
            if 'Phone' not in lines[0]:
                address = address + ', ' + lines.pop(0).strip()
            phone = lines.pop(0).split(':')[1].strip()
            fax = None
            if 'Fax' in lines.pop(0):
                fax = lines.pop(0)
            membership.add_contact_detail('address', address, 'legislature')
            membership.add_contact_detail('voice', phone, 'legislature')
            if fax:
                membership.add_contact_detail('fax', fax, 'legislature')
            # @todo emails are being assigned incorrectly, e.g. Town of Berwick picks
            # up Cape Breton Regional Municipality and Region of Queens Municipality
            regex = name.split()[-1].lower() + '|' + '|'.join(district.split()[-2:]).replace('of', '').lower()
            regex = regex.replace('||', '|')
            for i, email in enumerate(emails):
                if re.findall(regex, email):
                    membership.add_contact_detail('email', emails.pop(i))
                    break
            yield p
    finally:
        os.unlink('/tmp/ns.pdf')
def scrape(self):
    """Scrape Abbotsford's council, pairing bios with the contact table."""
    seat = 1
    coun_page = self.lxmlize(COUNCIL_PAGE)
    contact_page = self.lxmlize(CONTACT_PAGE)
    members = coun_page.xpath('//div[@id="main-content"]//h3')
    contact_rows = contact_page.xpath('//p[contains(./strong/text(), "Mayor & Council")]/following-sibling::table[1]//tr')[1:]
    for member, contact in zip(members, contact_rows):
        text = member.text_content()
        if text.startswith('Councill'):
            role = 'Councillor'
            district = 'Abbotsford (seat {})'.format(seat)
            seat += 1
        else:
            role = 'Mayor'
            district = 'Abbotsford'
        # Heading text reads "<title> <name>".
        name = text.split(' ', 1)[1]
        image = member.xpath('./img/@src')[0]
        phone = contact.xpath('./td[2]/text()')[0]
        fax = contact.xpath('./td[3]/text()')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(CONTACT_PAGE)
        p.image = image
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        yield p
def scrape(self):
    """Scrape Saint-Jérôme's mayor and councillors from two-cell rows."""
    page = self.lxmlize(COUNCIL_PAGE)
    rows = [tr for tr in page.xpath('//table//tr[1]') if len(tr) == 2][:-1]
    for row in rows:
        texts = [t.strip() for t in row.xpath('.//text()[normalize-space()]') if t.strip()]
        if len(texts) == 3:
            # Mayor rows carry only name, phone and email — no district line.
            role = 'Maire'
            district = 'Saint-Jérôme'
        else:
            role = 'Conseiller'
            district = texts[0].replace('numéro ', '')
        # The last three chunks are always name, phone, email.
        name = texts[-3]
        phone = texts[-2]
        email = texts[-1]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = row.xpath('.//img/@src')[0]
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email)
        yield p
def scrape(self): page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1') councillors = page.xpath('//div[@id="PageContent"]/table/tbody/tr/td') for councillor in councillors: if not councillor.text_content().strip(): continue if councillor == councillors[0]: district = 'Kirkland' role = 'Maire' else: district = councillor.xpath('.//h2')[0].text_content() district = re.search('- (.+)', district).group(1).strip() district = district.replace(' Ouest', ' ouest').replace(' Est', ' est') role = 'Conseiller' name = councillor.xpath('.//strong/text()')[0] phone = councillor.xpath('.//div[contains(text(), "#")]/text()')[0].replace('T ', '').replace(' ', '-').replace(',-#-', ' x') email = self.get_email(councillor) p = Person(primary_org='legislature', name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.add_contact('voice', phone, 'legislature') p.add_contact('email', email) p.image = councillor.xpath('.//img/@src')[0] yield p
def scrape(self):
    """Scrape Whitby's council via the "Meet Your Council" menu links."""
    regional_seat = 1
    page = self.lxmlize(COUNCIL_PAGE)
    urls = page.xpath('//a[@title="Mayor and Council::Meet Your Council"]/following-sibling::ul//@href')
    assert len(urls), 'No councillors found'
    for url in urls:
        node = self.lxmlize(url).xpath('//div[@id="printArea"]')[0]
        name = node.xpath('.//h1/text()')[0]
        if 'Mayor' in name:
            role = 'Mayor'
            district = 'Whitby'
            name = name.replace('Mayor ', '')
        else:
            role = node.xpath('.//h2/text()')[0]
            if 'Regional Councillor' in role:
                # Regional councillors get synthetic seat numbers.
                district = 'Whitby (seat {})'.format(regional_seat)
                regional_seat += 1
            else:
                role, district = role.split(', ')
                district = district.split(' (')[0]
        image = node.xpath('.//img/@src')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', self.get_phone(node), 'legislature')
        p.add_contact('email', self.get_email(node))
        p.image = image
        yield p
def scrape(self):
    """Scrape Trois-Rivières: mayor from a static page, councillors from
    a client-side-rendered menu parsed with a regex."""
    # Mayor first; no email is published for the mayor.
    mayor_page = self.lxmlize(MAYOR_URL)
    photo_url = mayor_page.xpath('//img/@src[contains(., "maire")]')[0]
    name = mayor_page.xpath('//td[@class="contenu"]/text()[last()]')[0]
    mayor = Person(primary_org='legislature', name=name, district="Trois-Rivières", role="Maire", image=photo_url)
    mayor.add_source(MAYOR_URL)
    yield mayor

    resp = self.get(COUNCIL_PAGE)  # page rendering through JS on the client
    page_re = re.compile(r'createItemNiv3.+"District (.+?)".+(index.+)\\"')
    for district, url_rel in page_re.findall(resp.text):
        if district not in ('des Estacades', 'des Plateaux', 'des Terrasses', 'du Sanctuaire'):
            # Strip the leading particle ("de", "de la", "des", "du").
            district = re.sub(r'\A(?:de(?: la)?|des|du) ', '', district)
        url = urljoin(COUNCIL_PAGE, url_rel)
        page = self.lxmlize(url)
        headings = page.xpath('//h2//text()')
        if headings:
            name = headings[0]
        email = self.get_email(page)
        photo_url = page.xpath('//img/@src[contains(., "Conseiller")]')[0]
        p = Person(primary_org='legislature', name=name, district=district, role='Conseiller', image=photo_url)
        p.add_source(url)
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Scrape St. John's council from the member directory view."""
    seat = 1
    page = self.lxmlize(COUNCIL_PAGE)
    for node in page.xpath('//div[@class="view-content"]/div'):
        fields = node.xpath('./div')
        role = fields[0].xpath('./div//text()')[0]
        name = fields[2].xpath('.//a//text()')[0].title().split(role)[-1].strip()
        if name == 'Vacant':
            continue
        if 'Ward' in role:
            district = role
            role = 'Councillor'
        elif 'At Large' in role:
            # At-large members get synthetic seat numbers.
            role = 'Councillor at Large'
            district = "St. John's (seat {})".format(seat)
            seat += 1
        else:
            district = "St. John's"
        phone = fields[3].xpath('./div//text()')[0]
        email = self.get_email(fields[5])
        photo_url = node.xpath('.//img/@src')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email)
        p.image = photo_url
        yield p
def scrape(self): page = self.lxmlize(COUNCIL_PAGE) councillors = page.xpath('//p[@class="WSIndent"]/a') for councillor in councillors: district = re.findall(r'(Ward [0-9]{1,2})', councillor.text_content()) if district: district = district[0] name = councillor.text_content().replace(district, '').strip() role = 'Councillor' else: district = 'Kawartha Lakes' name = councillor.text_content().replace('Mayor', '').strip() role = 'Mayor' url = councillor.attrib['href'] page = self.lxmlize(url) email = self.get_email(page) image = page.xpath('//img[@class="image-right"]/@src')[0] p = Person(primary_org='legislature', name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.add_source(url) p.add_contact('email', email) p.image = image yield p
def scrape(self):
    """Scrape Pointe-Claire's council; the first member without a district
    label is the mayor."""
    page = self.lxmlize(COUNCIL_PAGE)
    sections = page.xpath('//section[contains(@id, "js-council-member")]')
    assert len(sections), 'No councillors found'
    for index, section in enumerate(sections):
        name = ' '.join(n.strip() for n in section.xpath('.//h2/text()'))
        labels = section.xpath('.//span[contains(@class, "c-info-list_label")][contains(text(), "District ")]')
        if labels:
            role = 'Conseiller'
            district = labels[0].text_content().split(' – ')[0].strip()
        elif index == 0:
            role = 'Maire'
            district = 'Pointe-Claire'
        else:
            assert False, "error parsing district"
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.image = section.xpath('.//@src')[0]
        p.add_contact('email', self.get_email(section))
        p.add_contact('voice', self.get_phone(section, area_codes=[514]), 'legislature')
        p.add_source(COUNCIL_PAGE)
        yield p
def scrape(self):
    """Yield the mayor, then each ward councillor.

    Pages are Latin-1 encoded; detail URLs are relative and have '../'
    segments stripped before fetching.
    """
    page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    yield self.scrape_mayor()
    # Candidate paragraphs: those whose text contains a hyphen.
    councillors = page.xpath('//div[@class="articlebody-inside"]//p[contains(text(),"-")]')
    for councillor in councillors:
        url = councillor.xpath('.//a')[0].attrib['href'].replace('../', '')
        page = self.lxmlize(url, 'iso-8859-1')
        # Title reads e.g. "Councillor Jane Doe" or "Deputy Mayor John Doe".
        name = page.xpath('//div[@class="articletitle"]/h1')[0].text_content().replace('Councillor', '').replace('Deputy Mayor', '')
        # Keep only the digits of the first body paragraph to form "Ward N".
        district = 'Ward {}'.format(re.sub(r'\D+', '', page.xpath('//div[@class="articlebody-inside"]/p')[0].text_content()))
        p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        photo_url_rel = page.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0].replace('/..', '')
        p.image = urljoin(url, photo_url_rel)
        # The second paragraph holds colon-separated contact info; each
        # value is preceded by its label in the previous chunk.
        contacts = page.xpath('//div[@class="articlebody-inside"]/p')[1].text_content().replace('Biography', '').replace('Committees', '').split(':')
        for i, contact in enumerate(contacts):
            if i == 0 or not contact:
                continue
            # First capitalized word of the preceding chunk is taken as the
            # label — TODO confirm this always matches the intended label.
            contact_type = re.findall(r'([A-Z][a-z]+)', contacts[i - 1])[0]
            if contact_type != 'Address':
                # Non-address values run until the next capital letter
                # (the start of the following label).
                contact = re.split(r'[A-Z]', contact)[0]
            contact_type = CONTACT_DETAIL_TYPE_MAP[contact_type]
            p.add_contact(contact_type, contact, '' if contact_type == 'email' else 'legislature')
        yield p
def scrape_mayor(self, url):
    """Return Caledon's mayor as a Person built from the profile page."""
    page = self.lxmlize(url)
    heading = page.xpath('//div[@id="printAreaContent"]/h1/strong/text()')[0]
    mailing = page.xpath('//strong[contains(text(), "mail")]/parent::p/text()')[1]
    phone_line = page.xpath('//strong[contains(text(), "phone")]/parent::p/text()')[1]
    p = Person(primary_org='legislature', name=heading.replace('Mayor', '').strip(),
               district='Caledon', role='Mayor')
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    p.image = page.xpath('//h2[contains(text(), "About me")]/img/@src')[0]
    p.add_contact('address', mailing.replace(':', '').strip(), 'legislature')
    p.add_contact('voice', phone_line.split()[1], 'legislature')
    return p
def scrape(self):
    """Scrape MLAs from the member table (name, constituency, party)."""
    member_page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
    rows = member_page.xpath('//table')[0].xpath('.//tr')[1:]
    assert len(rows), 'No members found'
    for row in rows:
        name_cell, constit_cell, party_cell = row.xpath('.//td')
        full_name = name_cell.text_content().strip()
        if full_name.lower() == 'vacant':
            continue
        # Names are "Last, First [Hon.]"; rebuild as "First Last".
        last, first = full_name.split(',')
        name = first.replace('Hon.', '').strip() + ' ' + last.title().strip()
        district = ' '.join(constit_cell.text_content().split())
        party = get_party(party_cell.text)
        url = name_cell.xpath('.//a')[0].get('href')
        detail = self.lxmlize(url)
        email = self.get_email(detail)
        p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('email', email)
        portrait = detail.xpath('//img[@class="page_graphic"]/@src')
        if portrait:
            p.image = portrait[0]
        yield p
def scrape(self):
    """Yield an Organization per district council and a Person per member.

    Walks the first four municipality-type listings, then each district
    page within them.  The member at index 0 in the right-hand column
    becomes the mayor; subsequent members become councillors.
    NOTE(review): ``org_types`` is presumably a module-level sequence of
    name suffixes indexed by listing position — confirm against the file.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    # First four links of the shaded navigation block, one per
    # municipality type (note the trailing space in the class attribute).
    types = page.xpath(
        '//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href'
    )[:4]
    for org_type, link in enumerate(types):
        page = self.lxmlize(link)
        district_urls = page.xpath(
            '//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href'
        )
        for district_url in district_urls:
            page = self.lxmlize(district_url)
            # Page header reads "<prefix> - <district name>".
            district = page.xpath('//div[@class="pageHeader"]/h1/text()'
                                  )[0].split(' - ')[1].strip()
            org = Organization(
                name=district + org_types[org_type],
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(district_url)
            yield org
            address = ', '.join(
                page.xpath('//div[@class="left_contents"]/p[1]/text()'))
            # The "Contact" paragraph holds "label: value" lines; the first
            # is the phone, the second the fax.
            contacts = page.xpath(
                '//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()'
            )
            phone = contacts[0].split(':')[1].strip().replace(' ', '-')
            fax = contacts[1].split(':')[1].strip().replace(' ', '-')
            email = self.get_email(page, '//div[@class="left_contents"]')
            # Any non-mailto link in the contact block is taken as the website.
            site = page.xpath(
                '//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]'
            )
            if site:
                site = site[0].text_content()
            councillors = page.xpath(
                '//div[@class="right_contents"]//p/text()')
            for i, councillor in enumerate(councillors):
                if 'Vacant' in councillor:
                    continue
                p = Person(primary_org='legislature', name=councillor, district=district)
                p.add_source(COUNCIL_PAGE)
                p.add_source(link)
                p.add_source(district_url)
                # Index 0 is the mayor; everyone else is a councillor.
                if i == 0:
                    membership = p.add_membership(org, role='Mayor')
                else:
                    membership = p.add_membership(org, role='Councillor')
                membership.post_id = district
                membership.add_contact_detail('address', address, 'legislature')
                if phone:
                    membership.add_contact_detail('voice', phone, 'legislature')
                if fax:
                    membership.add_contact_detail('fax', fax, 'legislature')
                if email:
                    membership.add_contact_detail('email', email)
                if site:
                    p.add_link(site)
                yield p
def scrape_mayor(self, div):
    """Return Saint-Jean-sur-Richelieu's mayor as a Person.

    The mayor's email is only reachable through a web form, so no email
    contact is recorded.
    """
    name = div.xpath('.//a')[0].text_content()
    url = div.xpath('.//a/@href')[0]
    page = self.lxmlize(url)
    contact_url = page.xpath('//a[@title="Joindre le maire"]/@href')[0]
    contact_page = self.lxmlize(contact_url)

    p = Person(primary_org='legislature', name=name, district='Saint-Jean-sur-Richelieu', role='Maire')
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    p.add_source(contact_url)
    p.image = div.xpath('./preceding-sibling::td//img/@src')[-1]

    details = contact_page.xpath(
        '//div[@id="ctl00_PlaceHolderMain_ctl01_ctl01__ControlWrapper_RichHtmlField"]//div/font/text()'
    )
    p.add_contact('address', ' '.join(details[:4]), 'legislature')
    p.add_contact('voice', details[-3].split(':')[1].strip().replace(' ', '-'), 'legislature')
    p.add_contact('fax', details[-2].split(':')[1].strip().replace(' ', '-'), 'legislature')
    # mayor's email is a form
    return p
def scrape(self): councillor_seat_number = 1 page = self.lxmlize(COUNCIL_PAGE) mayor = page.xpath('//div/a[contains(@title, "Profile")][1]/@href') councillors = mayor + page.xpath( '//td//a[contains(@title, "Profile")][1]/@href') assert len(councillors), 'No councillors found' for councillor in councillors: page = self.lxmlize(councillor) info = page.xpath('//table/tbody/tr/td[2]')[0] for br in info.xpath('*//br'): br.tail = '\n' + br.tail if br.tail else '\n' lines = [ line.strip() for line in info.text_content().split('\n') if line.strip() ] name = lines[0].replace('Councillor ', '').replace('Mayor ', '') if lines[1].endswith(' Ward'): district = lines[1].replace(' Ward', '') role = 'Councillor' elif 'At Large' in lines[1]: role = 'Councillor at Large' district = 'Thunder Bay (seat {})'.format( councillor_seat_number) councillor_seat_number += 1 else: district = 'Thunder Bay' role = 'Mayor' name = name.replace('Councillor', '').replace('At Large', '').replace('Mayor', '').strip() p = Person(primary_org='legislature', name=name, district=district, role=role) p.add_source(COUNCIL_PAGE) p.add_source(councillor) p.image = page.xpath('//td[@valign="top"]/img/@src')[0] address = ', '.join(info.xpath('./p/text()')[0:2]).strip() address = re.sub(r'\s{2,}', ' ', address) p.add_contact('address', address, 'legislature') contacts = filter(None, (text.strip() for text in info.xpath('./p[2]/text()'))) for contact in contacts: contact_type, contact = contact.replace('Cel:', 'Cell:').split(':') contact = contact.replace('(1st)', '').replace('(2nd)', '').strip() if 'Fax' in contact_type: p.add_contact('fax', contact, 'legislature') elif 'Email' in contact_type: break else: p.add_contact('voice', contact, contact_type) email = self.get_email(info) p.add_contact('email', email) yield p
def scrape(self):
    """Scrape Saskatchewan municipal councils from a two-column PDF directory.

    Downloads the PDF, converts it with ``pdftotext -layout``, splits each
    page into its two columns (one municipality block per column), and yields
    an Organization per council plus a Person per mayor/councillor/alderman.
    """
    response = urlopen(COUNCIL_PAGE).read()
    # Bug fix: the downloaded PDF is bytes, so the temp file must be opened
    # in binary mode ('w' raises TypeError under Python 3).
    with open('/tmp/sk.pdf', 'wb') as pdf:
        pdf.write(response)
    try:
        # Bug fix: check_output returns bytes; decode before string handling.
        data = subprocess.check_output(
            ['pdftotext', '-layout', '/tmp/sk.pdf', '-']).decode('utf-8')
        data = data.splitlines(True)
        pages = []
        page = []
        for line in data:
            # Group lines into pages, dropping headers and blank lines.
            if line.strip() and 'Page' not in line and 'CITIES' not in line and 'NORTHERN TOWNS, VILLAGES' not in line:
                page.append(line)
            elif page:
                pages.append(page)
                page = []
        districts = []
        for page in pages:
            # A run of 6+ spaces in the first line marks the column boundary.
            index = re.search(r'(\s{6,})', page[0])
            if index:
                index = index.end() - 1
            else:
                index = -1
            dist1 = []
            dist2 = []
            for line in page:
                dist1.append(line[:index].strip())
                dist2.append(line[index:].strip())
            districts.append(dist1)
            districts.append(dist2)
        for district in districts:
            district_name = district.pop(0).split(',')[0].title()
            org = Organization(
                name=district_name + ' Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(COUNCIL_PAGE)
            councillors = []
            contacts = {}
            for i, line in enumerate(district):
                if 'Phone' in line:
                    phone = line.split(':')[1].replace('(', '').replace(') ', '-').strip()
                    if phone:
                        contacts['voice'] = phone
                if 'Fax' in line:
                    fax = line.split(':')[1].replace('(', '').replace(') ', '-').strip()
                    if fax:
                        contacts['fax'] = fax
                if 'E-Mail' in line:
                    email = line.split(':')[1].strip()
                    if email:
                        contacts['email'] = email
                if 'Address' in line and line.split(':')[1].strip():
                    # The address continues on all remaining lines of the block.
                    address = line.split(':')[1].strip() + ', ' + ', '.join(
                        district[i + 1:]).replace(' ,', '')
                    contacts['address'] = address
                if 'Mayor' in line or 'Councillor' in line or 'Alderman' in line:
                    councillor = line.split(':')[1].replace('Mr.', '').replace(
                        'Mrs.', '').replace('Ms.', '').replace(
                        'His Worship', '').replace('Her Worship', '').strip()
                    role = line.split(':')[0].strip()
                    if councillor:
                        councillors.append([councillor, role])
            if not councillors:
                continue
            yield org
            for councillor in councillors:
                p = Person(primary_org='legislature', name=councillor[0],
                           district=district_name)
                p.add_source(COUNCIL_PAGE)
                membership = p.add_membership(org, role=councillor[1],
                                              district=district_name)
                for key, value in contacts.items():
                    # Email contacts carry no note; all others are 'legislature'.
                    membership.add_contact_detail(
                        key, value, '' if key == 'email' else 'legislature')
                yield p
    finally:
        # Bug fix: remove the temp file with os.unlink (even on error) instead
        # of shelling out to `rm` after the generator completes.
        os.unlink('/tmp/sk.pdf')
def scrape_mayor(div, name):
    """Build a Person for Wilmot's mayor from the text following *div*."""
    mayor = Person(primary_org='legislature', name=name, district='Wilmot', role='Mayor')
    mayor.add_source(COUNCIL_PAGE)
    # Drop the first text node (the name); the rest is address, phone,
    # fax, and the email on the last line.
    details = div.xpath('./parent::p//text()')[1:]
    mayor.add_contact('address', ' '.join(details[:3]), 'legislature')
    mayor.add_contact('voice', details[3].split()[1], 'legislature')
    mayor.add_contact('fax', details[4].split()[1], 'legislature')
    mayor.add_contact('email', details[-1])
    return mayor
def scrape(self):
    """Scrape federal MPs from the House of Commons member list."""
    # Twitter screen names are maintained by an external service, keyed by MP name.
    screen_names = json.loads(
        self.get('http://scrapers-ruby.herokuapp.com/twitter_users').text)
    page = self.lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//div[@class="main-content"]//tr')[1:]  # skip header row
    assert len(rows), 'No members found'
    for row in rows:
        name_cell = row.xpath('./td[1]')[0]
        last_name = name_cell.xpath('.//span[1]//text()')[0]
        first_name = name_cell.xpath('.//span[2]//text()')[0]
        name = '{} {}'.format(first_name, last_name)
        constituency = row.xpath('./td[2]//text()')[0].replace(
            '–', '—')  # n-dash, m-dash
        if constituency == 'Mont-Royal':
            constituency = 'Mount Royal'
        province = row.xpath('./td[3]//text()')[0]
        party = row.xpath('string(./td[4])')  # allow string()
        url = name_cell.xpath('.//a/@href')[0]
        # Quebec MPs' profiles are scraped from the French version of the site.
        if province == 'Québec':
            url = url.replace('/en/', '/fr/')
        mp_page = self.lxmlize(url)
        email = self.get_email(mp_page, '//span[@class="caucus"]', error=False)
        photo = mp_page.xpath(
            '//div[@class="profile overview header"]//img/@src')[0]
        m = Person(primary_org='lower', name=name, district=constituency,
                   role='MP', party=party)
        m.add_source(COUNCIL_PAGE)
        m.add_source(url)
        screen_name = screen_names.get(name)
        if screen_name:
            m.add_link('https://twitter.com/{}'.format(screen_name))
        # @see http://www.parl.gc.ca/Parliamentarians/en/members/David-Yurdiga%2886260%29
        if email:
            m.add_contact('email', email)
        elif name == 'Adam Vaughan':
            m.add_contact('email', '*****@*****.**')
        if photo:
            # Determine whether the photo is actually a generic silhouette
            photo_response = self.get(photo)
            if (photo_response.status_code == 200
                    and hashlib.sha1(photo_response.content).hexdigest()
                    not in IMAGE_PLACEHOLDER_SHA1):
                m.image = photo
        personal_url = mp_page.xpath(
            '//a[contains(@title, "Personal Web Site")]/@href')
        if personal_url:
            m.add_link(personal_url[0])
        # Hill office address is fixed; French for Quebec MPs.
        if province == 'Québec':
            m.add_contact('address', 'Chambre des communes\nOttawa ON K1A 0A6',
                          'legislature')
        else:
            m.add_contact('address', 'House of Commons\nOttawa ON K1A 0A6',
                          'legislature')
        # Hill phone/fax: match either the English or French label, then strip it.
        voice = mp_page.xpath(
            '//div[@class="hilloffice"]//span//text()[contains(., "Telephone:")]|//div[@class="hilloffice"]//span//text()[contains(., "Téléphone :")]'
        )[0].replace('Telephone: ', '').replace('Téléphone : ', '')
        if voice:
            m.add_contact('voice', voice, 'legislature')
        fax = mp_page.xpath(
            '//div[@class="hilloffice"]//span//text()[contains(., "Fax:")]|//div[@class="hilloffice"]//span//text()[contains(., "Télécopieur :")]'
        )[0].replace('Fax: ', '').replace('Télécopieur : ', '')
        if fax:
            m.add_contact('fax', fax, 'legislature')
        # Constituency offices: the first is noted "constituency", later ones
        # are numbered "constituency (2)", "constituency (3)", ...
        for i, li in enumerate(
                mp_page.xpath('//div[@class="constituencyoffices"]//li')):
            spans = li.xpath('./span[not(@class="spacer")]')
            note = 'constituency'
            if i:
                note += ' ({})'.format(i + 1)
            m.add_contact(
                'address',
                '\n'.join([
                    spans[0].text_content(),  # address line 1
                    spans[1].text_content(),  # address line 2
                    spans[2].text_content(),  # city, region
                    spans[3].text_content(),  # postal code
                ]), note)
            voice = li.xpath(
                './span//text()[contains(., "Telephone:")]|./span//text()[contains(., "Téléphone :")]'
            )
            if voice:
                voice = voice[0].replace('Telephone: ', '').replace('Téléphone : ', '')
                if voice:
                    m.add_contact('voice', voice, note)
            fax = li.xpath(
                './span//text()[contains(., "Fax:")]|./span//text()[contains(., "Télécopieur :")]'
            )
            if fax:
                fax = fax[0].replace('Fax: ', '').replace('Télécopieur : ', '')
                if fax:
                    m.add_contact('fax', fax, note)
        yield m
def scrape_mayor(self):
    """Scrape Saskatoon's mayor from the mayor page and its contact subpage."""
    page = self.lxmlize(MAYOR_PAGE)
    portrait = page.xpath('//img[contains(@alt, "Mayor")]/@src')[0]
    contact_url = page.xpath(
        '//a[contains(text(), "Contact the Mayor")]/@href')[0]
    contact_page = self.lxmlize(contact_url)

    blocks = contact_page.xpath(
        '//h4[contains(text(), "Address")]/following-sibling::p')
    address_lines = blocks[0].text_content().split('\n')
    # The first address line reads like "Office of Mayor <name>": the name is
    # everything after the first two words.
    name = ' '.join(address_lines[0].split()[2:])
    address = ' '.join(address_lines[1:])
    phone_lines = blocks[1].text_content().split('\n')
    phone = phone_lines[0].replace('Phone', '')
    fax = phone_lines[1].replace('Fax', '')

    mayor = Person(primary_org='legislature', name=name,
                   district='Saskatoon', role='Mayor')
    mayor.add_source(MAYOR_PAGE)
    mayor.add_source(contact_url)
    mayor.image = portrait
    mayor.add_contact('address', address, 'legislature')
    mayor.add_contact('voice', phone, 'legislature')
    mayor.add_contact('fax', fax, 'legislature')
    return mayor
def scrape(self):
    """Scrape the mayor and councillors, numbering seats within each ward."""
    seat_numbers = defaultdict(int)
    page = self.lxmlize(COUNCIL_PAGE)
    yield self.scrape_mayor()
    for row in page.xpath('//div[@id="centre_content"]//tr'):
        if 'Position' in row.text_content():  # skip the header row
            continue
        cells = row.xpath('./td')
        ward = cells[0].text_content().replace('Councillor', '')
        seat_numbers[ward] += 1
        district = '{} (seat {})'.format(ward, seat_numbers[ward])
        name = cells[1].text_content()
        url = row.xpath('./td/a')[0].attrib['href']

        p = Person(primary_org='legislature', name=name,
                   district=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)

        detail = self.lxmlize(url)
        content = detail.xpath('//div[@id="centre_content"]')[0]
        p.add_contact('email', self.get_email(content))
        p.add_contact('voice',
                      self.get_phone(content, area_codes=[226, 519]),
                      'legislature')
        p.image = detail.xpath(
            'string(//div[@id="centre_content"]//img/@src)')  # can be empty
        links = detail.xpath('//div[@id="centre_content"]//a')
        if len(links) > 2:
            p.add_link(links[-1].attrib['href'])
        yield p
def scrape(self):
    """Scrape Halifax's district councillors and the mayor."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[./h2/a[contains(@href, "/District")]]')
    for councillor in councillors:
        # District label: join the non-empty <p> text fragments with an
        # m-dash, normalizing any dash variant already present.
        district = re.sub(
            r' ?[–—-] ?', '—', '—'.join(
                filter(None, (text.replace(',', '').strip()
                              for text in councillor.xpath('./p/text()')))))
        name_elem = councillor.xpath('./p/strong/text()')[0]
        if 'Councillor' in name_elem:
            name = name_elem.strip()[len('Councillor '):]
        else:
            name = name_elem
        if name != 'To be determined':  # skip vacant seats
            photo = councillor.xpath('./p/a/img/@src')[0]
            councillor_page = self.lxmlize(
                councillor.xpath('./h2/a/@href')[0])
            contact_page_url = councillor_page.xpath(
                '//li/a[contains(@href, "contact")]/@href')[0]
            contact_page = self.lxmlize(contact_page_url)
            contact_node = contact_page.xpath(
                '//div[./h1[contains(text(), "Contact")]]')[0]
            phone = self.get_phone(contact_node, area_codes=[902])
            email = self.get_email(contact_node)
            p = Person(primary_org='legislature', name=name,
                      district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(contact_page_url)
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', email)
            p.image = photo
            yield p
    # The mayor has separate bio and contact pages (Latin-1 encoded).
    mayor_page = self.lxmlize(MAYOR_PAGE, 'iso-8859-1')
    # Heading reads "<name> Bio"; slice off the trailing " Bio".
    name = ' '.join(mayor_page.xpath(
        '//h2[contains(., "Bio")]/text()')).strip()[:-len(' Bio')]
    contact_page = self.lxmlize(MAYOR_CONTACT_URL, 'iso-8859-1')
    email = self.get_email(contact_page)
    p = Person(primary_org='legislature', name=name, district='Halifax', role='Mayor')
    p.add_source(MAYOR_PAGE)
    p.add_source(MAYOR_CONTACT_URL)
    p.add_contact('email', email)
    yield p
def scrape(self):
    """Scrape MLAs by joining a CSV export with the HTML member index page."""
    index = self.lxmlize(MEMBER_INDEX_URL)
    csv_text = self.get(COUNCIL_PAGE).text
    csv_text = '\n'.join(csv_text.split('\n')[3:])  # discard first 3 rows
    reader = csv.reader(StringIO(csv_text))
    # make unique field names for the two sets of address fields
    field_names = next(reader)
    for name in OFFICE_FIELDS:
        assert(field_names.count(name) == 2)
        # list.index returns the first occurrence, so two passes suffix the
        # first duplicate with "1" and the remaining one with "2".
        field_names[field_names.index(name)] = '{} 1'.format(name)
        field_names[field_names.index(name)] = '{} 2'.format(name)
    rows = [dict(zip(field_names, row)) for row in reader]
    assert len(rows), 'No members found'
    for mla in rows:
        name = '{} {} {}'.format(
            mla['MLA First Name'],
            mla['MLA Middle Names'],
            mla['MLA Last Name'],
        )
        if name.strip() == '':  # skip blank/padding rows
            continue
        party = get_party(mla['Caucus'])
        # Drop any status suffix after a comma (e.g. honorifics).
        name_without_status = name.split(',')[0]
        # Locate this MLA's row on the index page by constituency name.
        row_xpath = '//td[normalize-space()="{}"]/..'.format(
            mla['Constituency Name'],
        )
        detail_url, = index.xpath('{}//a/@href'.format(row_xpath))
        photo_url, = index.xpath('{}//img/@src'.format(row_xpath))
        p = Person(
            primary_org='legislature',
            name=name_without_status,
            district=mla['Constituency Name'],
            role='MLA',
            party=party,
            image=photo_url,
        )
        p.add_source(COUNCIL_PAGE)
        p.add_source(detail_url)
        if mla['Email']:
            p.add_contact('email', mla['Email'])
        elif mla.get('MLA Email'):
            p.add_contact('email', mla['MLA Email'])
        # The CSV is expected to list the legislature office first.
        assert(mla['Address Type 1'] == 'Legislature Office')
        assert(mla['Address Type 2'] == 'Constituency Office')
        for suffix, note in ((1, 'legislature'), (2, 'constituency')):
            for key, contact_type in (('Phone', 'voice'), ('Fax', 'fax')):
                value = mla['{} Number {}'.format(key, suffix)]
                if value and value != 'Pending':
                    p.add_contact(contact_type, value, note)
            address = ', '.join(
                filter(
                    bool,
                    [
                        mla[
                            '{} {}'.format(field, suffix)
                        ] for field in ADDRESS_FIELDS
                    ]
                )
            )
            if address:
                p.add_contact('address', address, note)
        yield p
def scrape(self):
    """Scrape BC MLAs from an XML listing plus each member's profile page."""
    # Map the short party labels on profile pages to full registered names.
    parties = {
        'BC NDP': 'New Democratic Party of British Columbia',
        'BC Liberal Party': 'British Columbia Liberal Party',
    }
    page = self.lxmlize(COUNCIL_PAGE, xml=True)
    # The listing is a SharePoint OData/dataservices XML document.
    nsmap = {'d': 'http://schemas.microsoft.com/ado/2007/08/dataservices'}
    members = page.xpath('//d:Cells', namespaces=nsmap)
    assert len(members), 'No members found'
    for member in members:
        # The profile URL is stored under the "Path" key of each cell group.
        url = member.xpath(
            './d:element/d:Key[text()="Path"]/following-sibling::d:Value/text()',
            namespaces=nsmap)[0]
        page = self.lxmlize(url)
        name = page.xpath(
            '//div[contains(@class, "BCLASS-pagetitle")]//h3/text()'
        )[0].replace('Wm.', '').replace(', Q.C.', '').strip()
        district, party = cleanup_list(
            page.xpath(
                '//div[@id="MinisterTitle"]/following-sibling::text()'))
        party = parties.get(party, party)  # expand short names when known
        p = Person(primary_org='legislature', name=name, district=district,
                   role='MLA', party=party)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = page.xpath('//img[contains(@src, "Members")]/@src')[0]
        email = page.xpath(
            '//div[@class="convertToEmail"]//text()')[0].strip()
        if '#' in email:  # strip any obfuscation suffix after "#"
            email = email.split('#')[0]
        if email:
            p.add_contact('email', email)
        office = ', '.join(
            cleanup_list(
                page.xpath(
                    '//h4[contains(text(), "Office:")]/ancestor::div/text()'
                )))
        office = re.sub(r'\s{2,}', ' ', office)  # collapse whitespace runs
        p.add_contact('address', office, 'legislature')
        constituency = ', '.join(
            cleanup_list(
                page.xpath(
                    '//h4[contains(text(), "Constituency:")]/ancestor::div[1]//text()'
                )))
        # Drop the trailing phone portion from the constituency address.
        constituency = re.sub(r'\s{2,}', ' ', constituency).split(', Phone')[0]
        p.add_contact('address', constituency, 'constituency')
        phones = cleanup_list(
            page.xpath(
                '//span[contains(text(), "Phone:")]/following-sibling::text()'
            ))
        # First phone is the legislature office; a second, if present, is
        # the constituency office.
        office_phone = phones[0]
        p.add_contact('voice', office_phone, 'legislature')
        if len(phones) > 1:
            constituency_phone = phones[1]
            p.add_contact('voice', constituency_phone, 'constituency')
        yield p
def scrape(self):
    """Scrape New Brunswick municipal councils and their members.

    Maps municipality names to OCD census-subdivision identifiers, walks the
    per-classification listing pages, and yields one Organization per council
    plus a Person per mayor/councillor.
    """
    # Cities with their own dedicated scrapers.
    exclude_divisions = {
        'ocd-division/country:ca/csd:1301006',  # Saint John
        'ocd-division/country:ca/csd:1307022',  # Moncton
        'ocd-division/country:ca/csd:1310032',  # Fredericton
    }
    expected_roles = {
        'Mayor',
        'Councillor',
    }
    # Roles that hold exactly one seat (no seat numbering needed).
    unique_roles = {
        'Mayor',
    }
    classifications = {
        'Cities': 'City',
        'Towns': 'Town',
        'Villages': 'Village',
        'Rural Communities': 'Community',
        'Regional Municipality': 'Regional',
    }
    # Page spellings that differ from the OCD division names.
    corrections = {
        'Beaubassin-est/East': 'Beaubassin East',
        'Lac-Baker': 'Lac Baker',
        'Saint-François-de-Madawaska': 'Saint-François de Madawaska',
        'Saint-Hilaire': 'Saint Hilaire',
    }
    unknown_names = {
        'Haut-Madawaska',  # incorporated after Census 2016
    }
    # People sharing a name get distinct fake birth dates to disambiguate.
    duplicate_names = {
        'Denis Savoie',
        'Josée Levesque',
        'Luc Levesque',
    }

    # Build name -> OCD id for New Brunswick ("13") census subdivisions.
    names_to_ids = {}
    for division in Division.get('ocd-division/country:ca').children('csd'):
        type_id = division.id.rsplit(':', 1)[1]
        if type_id.startswith('13'):
            if division.attrs['classification'] == 'P':  # skip parishes
                continue
            if division.name in names_to_ids:
                raise Exception('unhandled collision: {}'.format(division.name))
            else:
                names_to_ids[division.name] = division.id

    page = self.lxmlize(COUNCIL_PAGE)
    list_links = page.xpath(
        '//div[@id="sidebar"]//div[contains(@class, "list")][1]//a')
    birth_date = 1900
    seen = set()
    assert len(list_links), 'No list items found'
    for list_link in list_links:
        page = self.lxmlize(list_link.attrib['href'])
        detail_urls = page.xpath('//td[1]//@href')
        assert len(detail_urls), 'No municipalities found'
        for detail_url in detail_urls:
            page = self.lxmlize(detail_url, encoding='utf-8')
            # Heading reads "<category> - <name>"; normalize "St"/"St." to "Saint".
            division_name = re.sub(
                r'\ASt\b\.?', 'Saint',
                page.xpath('//h1/text()')[0].split(' - ', 1)[1])
            division_name = corrections.get(division_name, division_name)
            if division_name in unknown_names:
                continue
            division_id = names_to_ids[division_name]
            if division_id in exclude_divisions:
                continue
            if division_id in seen:
                raise Exception('unhandled collision: {}'.format(division_id))
            seen.add(division_id)
            division_name = Division.get(division_id).name

            organization_name = '{} {} Council'.format(
                division_name, classifications[list_link.text])
            organization = Organization(name=organization_name,
                                        classification='government')
            organization.add_source(detail_url)

            address = ', '.join(
                page.xpath('//div[@class="left_contents"]/p[1]/text()'))
            contacts = page.xpath(
                '//div[@class="left_contents"]/p[contains(., "Contact")]/text()'
            )
            phone = contacts[0].split(':')[1]
            # Bug fix: fax was previously left unbound (NameError) or stale
            # from the prior municipality when only one contact line existed.
            fax = None
            if len(contacts) > 1:
                fax = contacts[1].split(':')[1]
            email = self.get_email(page, '//div[@class="left_contents"]',
                                   error=False)
            url = page.xpath(
                '//div[@class="left_contents"]//@href[not(contains(., "mailto:"))]'
            )
            if url:
                url = url[0]

            groups = page.xpath('//div[contains(@class, "right_contents")]/p')
            assert len(groups), 'No groups found'
            for group in groups:
                # Heading is the pluralized role, e.g. "Councillors".
                role = group.xpath('./b/text()')[0].rstrip('s')
                if role not in expected_roles:
                    raise Exception('unexpected role: {}'.format(role))
                councillors = group.xpath('./text()')
                assert len(councillors), 'No councillors found'
                for seat_number, name in enumerate(councillors, 1):
                    if 'vacant' in name.lower():
                        continue
                    if role in unique_roles:
                        district = division_name
                    else:
                        district = '{} (seat {})'.format(
                            division_name, seat_number)
                    organization.add_post(role=role, label=district,
                                          division_id=division_id)
                    p = Person(primary_org='government',
                               primary_org_name=organization_name,
                               name=name, district=district, role=role)
                    p.add_source(COUNCIL_PAGE)
                    p.add_source(list_link.attrib['href'])
                    p.add_source(detail_url)
                    if name in duplicate_names:
                        p.birth_date = str(birth_date)
                        birth_date += 1
                    p.add_contact('address', address, 'legislature')
                    # @see https://en.wikipedia.org/wiki/Area_code_506
                    if phone:
                        p.add_contact('voice', phone, 'legislature',
                                      area_code=506)
                    if fax:
                        p.add_contact('fax', fax, 'legislature',
                                      area_code=506)
                    if email:
                        p.add_contact('email', email)
                    if url:
                        p.add_link(url)
                    p._related[0].extras[
                        'boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(
                            division_id.rsplit(':', 1)[1])
                    yield p
            yield organization
def scrape_mayor(self, url):
    """Scrape Côte-Saint-Luc's mayor from his profile page.

    Returns None while a councillor is only acting mayor, since that person
    would need two roles.
    """
    page = self.lxmlize(url)
    text = page.xpath('//h1//text()[contains(., "Mayor")]')[0]
    if 'Acting Mayor' in text:
        # A councillor is acting mayor. We would need to add two roles to
        # the same person, which can be done with a little effort.
        return
    name = re.sub('(?:Acting )?Mayor ', '', text)
    email = self.get_email(page)
    phone = self.get_phone(page.xpath('//table[1]')[0])
    p = Person(primary_org='legislature', name=name,
               district='Côte-Saint-Luc', role='Maire')
    p.add_source(COUNCIL_PAGE)
    # Bug fix: the profile URL was previously added as a source twice.
    p.add_source(url)
    p.image = page.xpath('.//div[@class="content"]//img/@src')[0]
    p.add_contact('email', email)
    p.add_contact('voice', phone, 'legislature')
    return p
def scrape(self):
    """Scrape Gatineau's mayor and councillors.

    The council page renders its member list client-side, so districts,
    names and profile links are pulled out of inline JavaScript arrays.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    # it's all javascript rendered on the client... wow.
    js = page.xpath('string(//div[@class="inner_container"]/div/script[2])'
                    )  # allow string()
    districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
    names = re.findall(r'arrayMembres\[a.+"(.+)"', js)
    urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)
    # first item in list is mayor
    p = Person(primary_org='legislature', name=names[0],
               district='Gatineau', role='Maire')
    p.add_source(COUNCIL_PAGE)
    p.add_source(MAYOR_CONTACT_PAGE)
    email = '*****@*****.**'  # hardcoded
    p.add_contact('email', email)
    yield p
    for raw_district, name, url in list(zip(districts, names, urls))[1:]:
        if name == 'Vacant':
            continue
        profile_url = COUNCIL_PAGE + '/' + url.split('/')[-1]
        profile_page = self.lxmlize(profile_url)
        photo_url = profile_page.xpath('//img/@src')[0]
        # Bug fix: use a raw string for the regex — '\d' in a plain string
        # is an invalid escape sequence in modern Python.
        district = 'District ' + re.search(r'\d+', raw_district).group(0)
        email = self.get_email(profile_page)
        p = Person(primary_org='legislature', name=name,
                   district=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(profile_url)
        p.image = photo_url
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Scrape Sault Ste. Marie's mayor and ward councillors from one table."""
    page = self.lxmlize(COUNCIL_PAGE)
    table_data = page.xpath('//div[@id="litcontentDiv"]//tr')
    council_data = table_data[2:-1]
    mayor_row = table_data[0]
    photo_url_rel = mayor_row.xpath('string(.//img/@src)')  # can be empty
    photo_url = urljoin(COUNCIL_PAGE, photo_url_rel)
    contact_node = mayor_row.xpath('./td')[1]
    name = contact_node.xpath('.//font[1]/text()')[0]
    email = self.get_email(contact_node)
    p = Person(primary_org='legislature', name=name,
               district='Sault Ste. Marie', role='Mayor')
    p.add_source(COUNCIL_PAGE)
    p.add_contact('email', email)
    p.image = photo_url
    yield p
    # Rows alternate between a row representing a ward name and a row of
    # that ward's councillors; pair them up two at a time.
    for ward_row, data_row in zip(*[iter(council_data)] * 2):
        district = ward_row.xpath('.//text()[contains(., "Ward")]')[0]
        district_num = district_name_using_number(district)
        for councillor_node in data_row.xpath('./td'):
            name = councillor_node.xpath(
                './/strong/text()|.//font[1]/text()')[0]
            email = self.get_email(councillor_node)
            photo_url_rel = councillor_node.xpath(
                'string(.//img/@src)')  # can be empty
            photo_url = urljoin(COUNCIL_PAGE, photo_url_rel)
            # address and phone are brittle, inconsistent
            p = Person(primary_org='legislature', name=name,
                       district=district_num, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            if email:
                p.add_contact('email', email)
            p.image = photo_url
            yield p
def scrape(self):
    """Scrape Belleville's mayor and ward councillors."""
    page = self.lxmlize(COUNCIL_PAGE)
    # The second <p> after the "Mayor" heading holds name and contact lines.
    node = page.xpath('//div[@class="content-field"]/h3[contains(./text(), "Mayor")]/following-sibling::p[2]')[0]
    name = node.xpath('./strong/text()')[0]
    phone = node.xpath('./text()')[2].split(': ')[1]
    fax = node.xpath('./text()')[3].split(': ')[1]
    email = node.xpath('./a/text()')[0]
    image = node.xpath('./preceding::p//img/@src')[0]
    p = Person(primary_org='legislature', name=name, district='Belleville', role='Mayor')
    p.add_source(COUNCIL_PAGE)
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('fax', fax, 'legislature')
    p.add_contact('email', email)
    p.image = image
    yield p
    wards = page.xpath('//h3[contains(text(), "Councillors")]')
    assert len(wards), 'No councillors found'
    for ward in wards:
        # Heading reads e.g. "Ward 1 Councillors".
        ward_name = re.search(r'(Ward.+) Councillors', ward.text).group(1)
        councillors = ward.xpath('./following-sibling::div[1]//strong')
        for councillor in councillors:
            # NOTE(review): self.seat_numbers appears to be a per-ward counter
            # (e.g. a defaultdict(int)) defined on the class — TODO confirm.
            self.seat_numbers[ward_name] += 1
            district = '{} (seat {})'.format(ward_name, self.seat_numbers[ward_name])
            role = 'Councillor'
            name = councillor.text_content()
            phone = councillor.xpath('./following-sibling::text()[2]')[0].split(':')[1]
            email = councillor.xpath('./following-sibling::a//text()')[0]
            image = councillor.xpath('./preceding::img[1]/@src')[0]
            p = Person(primary_org='legislature', name=name, district=district, role=role)
            p.add_source(COUNCIL_PAGE)
            p.add_contact('voice', phone, 'legislature')
            p.add_contact('email', email)
            p.image = image
            yield p
def scrape(self):
    """Scrape Yukon municipal councils from a PDF directory.

    The PDF is converted with ``pdftotext -layout``; blank lines separate
    municipality blocks. Yields an Organization per council and a Person
    per mayor/councillor.
    """
    response = urlopen(COUNCIL_PAGE).read()
    # Bug fix: the PDF payload is bytes, so the temp file must be opened in
    # binary mode ('w' raises TypeError under Python 3).
    with open('/tmp/yt.pdf', 'wb') as pdf:
        pdf.write(response)
    try:
        # Bug fix: check_output returns bytes; decode before regex splitting.
        data = subprocess.check_output(
            ['pdftotext', '-layout', '/tmp/yt.pdf', '-']).decode('utf-8')
        for municipality in re.split(r'\n\s*\n', data):
            if 'Councillors' not in municipality:
                continue
            lines = municipality.split('\n')
            if 'Page' in lines[0]:  # drop running page headers
                lines.pop(0)
            if not lines[0].strip():
                lines.pop(0)
            # Column boundaries: the first wide gap, and the gap after a colon.
            col1end = re.search(r'\s{2,}(\w)', lines[0].strip()).end()
            col2end = re.search(r':\s{2,}(\w)', lines[0].strip()).end()
            if 'Council' in lines[1]:
                address = lines[2][:col1end - 1].strip() + ' ' + lines[3][:col1end - 1].strip()
                district = lines[0][:col1end - 1].strip() + ' ' + lines[1][:col1end - 1].strip()
            else:
                address = lines[1][:col1end - 1].strip() + ' ' + lines[2][:col1end - 1].strip()
                district = lines[0][:col1end - 1].strip()
            organization = Organization(
                name=district + ' Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            organization.add_source(COUNCIL_PAGE)
            yield organization
            phone = re.findall(r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                               municipality)[0].replace(') ', '-')
            email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
            fax = None
            if 'Fax' in municipality:
                fax = re.findall(r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})',
                                 municipality)[0].replace(') ', '-')
            website = None
            if 'Website' in municipality:
                website = re.findall(r'((http:\/\/|www.)(\S*))',
                                     municipality)[0][0]
            # Names appear in the middle column below "Mayor:"/"Councillors".
            councillor_or_mayor = False
            for line in lines:
                if 'Mayor:' in line:
                    councillor_or_mayor = True
                    role = 'Mayor'
                    continue
                if 'Councillors' in line:
                    councillor_or_mayor = True
                    role = 'Councillor'
                    continue
                if councillor_or_mayor:
                    councillor = line[col1end - 1:col2end - 1].strip()
                    if not councillor:
                        continue
                    p = Person(primary_org='legislature', name=councillor,
                               district=district)
                    p.add_source(COUNCIL_PAGE)
                    membership = p.add_membership(organization, role=role,
                                                  district=district)
                    membership.add_contact_detail('address', address,
                                                  'legislature')
                    membership.add_contact_detail('voice', phone, 'legislature')
                    membership.add_contact_detail('email', email)
                    if fax:
                        membership.add_contact_detail('fax', fax, 'legislature')
                    if website:
                        p.add_link(website)
                    yield p
    finally:
        # Bug fix: remove the temp file with os.unlink (even on error) instead
        # of shelling out to `rm` after the generator completes.
        os.unlink('/tmp/yt.pdf')
def scrape(self):
    """Scrape Vaughan's mayor, ward councillors and regional councillors."""
    regional_councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@id="WebPartWPQ3"]//ul[@class="dfwp-list"][1]/li/div/div/a')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        url = councillor.attrib['href']
        page = self.lxmlize(url)
        title = page.xpath('//div[@class="PL_Title"]')[0].text_content()
        if "Councillor" in title:
            # Titles read "<district> [Regional] Councillor <name>".
            district, name = re.split(r'Councillor', title)
            role = 'Councillor'
            if "Regional" in district:
                role = 'Regional Councillor'
                district = "Vaughan (seat {})".format(regional_councillor_seat_number)
                regional_councillor_seat_number += 1
        else:
            # Mayor pages carry the name only in the meta keywords.
            name = re.search(r'Mayor ([^,]+)', page.xpath('//meta[@name="keywords"]/@content')[0]).group(1)
            district = 'Vaughan'
            role = 'Mayor'
        name = name.strip()
        if role == 'Mayor':
            # The mayor's contacts live on a separate "Contact the Mayor" page.
            detail = self.lxmlize(page.xpath('//a[contains(@href,"/Contact-the-Mayor")]/@href')[0])
            contact_info = detail.xpath('//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]')[0]
        else:
            contact_node = page.xpath('//div[@id="WebPartWPQ2"][contains(., "Phone")]')
            if contact_node:
                contact_info = contact_node[0]
            else:
                contact_info = page.xpath('//div[@id="WebPartWPQ3"]')[0]
        # Normalize the "ext. " separator into the conventional "x".
        phone = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4} ext\. [0-9]{4}', contact_info.text_content())[0].replace('ext. ', 'x')
        # The second plain phone-shaped number on the page is the fax line.
        fax = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4}', contact_info.text_content())[1]
        email = self.get_email(contact_info)
        p = Person(primary_org='legislature', name=name, district=district.strip(), role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email)
        image = page.xpath('//img[contains(@alt, "Councillor")]/@src')
        if image:
            p.image = image[0]
        if page.xpath('.//a[contains(@href,"facebook")]'):
            p.add_link(page.xpath('.//a[contains(@href,"facebook")]')[0].attrib['href'])
        if page.xpath('.//a[contains(@href,"twitter")]'):
            p.add_link(page.xpath('.//a[contains(@href,"twitter")]')[0].attrib['href'])
        if page.xpath('.//a[contains(@href,"youtube")]'):
            p.add_link(page.xpath('.//a[contains(@href, "youtube")]')[0].attrib['href'])
        yield p
def scrape(self):
    """Scrape Alberta MLAs from the CSV export plus each member's detail page."""
    csv_text = self.get(self.get_csv_url()).text
    rows = list(csv.DictReader(StringIO(csv_text)))
    assert len(rows), 'No members found'
    for mla in rows:
        full_name = '{} {} {}'.format(mla['MLA First Name'],
                                      mla['MLA Middle Names'],
                                      mla['MLA Last Name'])
        if not full_name.strip():  # skip blank/padding rows
            continue
        party = get_party(mla['Caucus'])
        # Drop any status suffix after a comma (e.g. honorifics).
        name_without_status = full_name.split(',')[0]
        detail_url = ('http://www.assembly.ab.ca/net/index.aspx?'
                      'p=mla_contact&rnumber={0}&leg=29'.format(
                          mla['Riding Number']))
        detail_page = self.lxmlize(detail_url)
        photo_url = detail_page.xpath('//img[@class="MemPhoto"]/@src')[0]
        p = Person(
            primary_org='legislature',
            name=name_without_status,
            district=mla['Riding Name'],
            role='MLA',
            party=party,
            image=photo_url,
        )
        p.add_source(COUNCIL_PAGE)
        p.add_source(detail_url)
        # Prefer the 'Email' column, falling back to 'MLA Email' if present.
        email = mla['Email'] or mla.get('MLA Email')
        if email:
            p.add_contact('email', email)
        if mla['Phone Number']:
            p.add_contact('voice', mla['Phone Number'], 'legislature')
        yield p
def scrape(self):
    """Scrape Terrebonne's councillors (gray boxes) and mayor (main box)."""
    page = self.lxmlize(COUNCIL_PAGE, 'utf-8')
    councillors = page.xpath('//div[@class="member-box member-box--gray"]')
    assert len(councillors), 'No councillors found'
    for councillor_elem in councillors:
        name = councillor_elem.xpath(
            './/div[@class="fiche__name"]/text()')[0]
        district = councillor_elem.xpath(
            './/div[@class="fiche__category"]/text()')[0]
        # The phone text is prefixed with a "T"; keep everything after it.
        phone = councillor_elem.xpath(
            './/div[@class="fiche__social"]/span/text()')[0].split('T')[1]
        email_mailto = councillor_elem.xpath(
            './/div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href'
        )
        photo_url = councillor_elem.xpath('.//img')[0].attrib['src']
        p = Person(primary_org='legislature',
                   name=name,
                   district=district,
                   role='Conseiller',
                   image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        if email_mailto:
            email = email_mailto[0].split('mailto:')[1]
            p.add_contact('email', email)
        yield p

    mayor_elem = page.xpath(
        '//div[@class="member-box member-box--main"]')[0]
    name = mayor_elem.xpath('.//div[@class="fiche__name"]/text()')[0]
    phone = mayor_elem.xpath(
        './/div[@class="fiche__social"]/span/text()')[0].split('T')[1]
    email_mailto = mayor_elem.xpath(
        './/div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href'
    )
    # Bug fix: the photo was previously read from councillor_elem (the last
    # councillor's box left over from the loop) instead of the mayor's box.
    photo_url = mayor_elem.xpath('.//img')[0].attrib['src']
    p = Person(primary_org='legislature',
               name=name,
               district='Terrebonne',
               role='Maire',
               image=photo_url)
    p.add_source(COUNCIL_PAGE)
    p.add_contact('voice', phone, 'legislature')
    if email_mailto:
        email = email_mailto[0].split('mailto:')[1]
        p.add_contact('email', email)
    yield p
def scrape(self):
    """Scrape Quebec MNAs from the member table plus their contact pages."""
    page = self.lxmlize(COUNCIL_PAGE)
    members = page.xpath('//*[@id="ListeDeputes"]/tbody/tr')
    assert len(members), 'No members found'
    for row in members:
        name_comma, division = [cell.text_content() for cell in row[:2]]
        # Names are listed "Last, First"; flip them around.
        name = ' '.join(reversed(name_comma.strip().split(',')))
        party = row[2].text_content()
        email = self.get_email(row[3], error=False)
        detail_url = row[0][0].attrib['href']
        detail_page = self.lxmlize(detail_url)
        photo_url = detail_page.xpath('//img[@class="photoDepute"]/@src')[0]
        division = division.replace('–', '—')  # n-dash, m-dash
        p = Person(primary_org='legislature', name=name, district=division,
                   role='MNA', party=party, image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_source(detail_url)
        if email:
            p.add_contact('email', email)
        contact_url = detail_url.replace('index.html', 'coordonnees.html')
        contact_page = self.lxmlize(contact_url)
        p.add_source(contact_url, note='For telephone number(s)')
        # Each address block may carry a phone; its <h3> heading determines
        # whether it is a constituency or a legislature number.
        for div in contact_page.xpath('//div[@class="blockAdresseDepute"]'):
            try:
                phone = self.get_phone(div)
                heading = div.find('h3').text
            except Exception:
                pass  # probably just no phone number present
            else:
                try:
                    note = {
                        'Circonscription': 'constituency',
                        'Parlement': 'legislature',
                        'Ministère': 'legislature',
                    }[heading]
                except KeyError:
                    raise  # scraper should be updated to handle new value
                else:
                    p.add_contact('voice', phone, note)
        yield p
def scrape(self):
    """Scrape councillors from one marked-up table, delegating the mayor."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//table[@id="Main Content"]//td[@colspan="3"]//td/p/b')
    for councillor in councillors:
        # Bold headings read "<district>: <name>".
        district, name = councillor.xpath('./text()')[0].split(':')
        if 'Mayor' in district:
            yield scrape_mayor(councillor, name)
            continue
        p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        # Gather this paragraph's text plus following paragraphs up to the
        # next bold heading (the next councillor).
        base_info = councillor.xpath('./parent::p/text()')
        for info in councillor.xpath('./parent::p/following-sibling::p'):
            if info.xpath('.//b'):
                break
            base_info = base_info + info.xpath('./text()')
        # Consume lines into the address until a postal code appears.
        address = ''
        complete = False
        while not complete:
            address = address + ' ' + base_info.pop(0)
            if re.search(r'[A-Z][0-9A-Z][A-Z] \d[A-Z]\d', address):
                complete = True
        p.add_contact('address', address, 'legislature')
        # Drop the trailing line — presumably the email text; TODO confirm.
        base_info.pop(-1)
        # The remaining text alternates contact labels and values.
        base_info = ' '.join(base_info).split()
        for i, contact in enumerate(base_info):
            if re.match(r'[0-9]', contact):
                continue  # skip values; they are consumed via i + 1 below
            if 'fax' in contact:
                p.add_contact('fax', base_info[i + 1], 'legislature')
            else:
                p.add_contact(contact, base_info[i + 1], contact)
        email = self.get_email(councillor, './parent::p/following-sibling::p')
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Scrape Charlottetown's mayor and ward councillors."""
    root = self.lxmlize(COUNCIL_PAGE)
    titles = root.xpath('//span[@class="Title"]')

    # The first Title span is the mayor; his name precedes a parenthesized note.
    mayor_span = titles[0]
    mayor_text = ' '.join(mayor_span.xpath('.//text()'))
    m = Person(primary_org='legislature',
               name=re.search(r'[^(]+', mayor_text).group(0).strip(),
               district='Charlottetown', role='Mayor')
    m.add_source(COUNCIL_PAGE)
    m.add_contact('email', mayor_span.xpath('following::a[1]/text()')[0])
    m.image = urljoin(COUNCIL_PAGE, mayor_span.xpath('img/@src')[0])
    yield m

    councillors = titles[1:]
    assert len(councillors), 'No councillors found'
    for span in councillors:
        text = ' '.join(span.xpath('.//text()'))
        # Headers look like "Councillor <name> - Ward N ..."; normalize the
        # various dash characters before splitting.
        parts = text.replace('\u2013', '-').replace('\x96', '-').split('-')
        if len(parts) != 2:
            continue
        name = parts[0].strip().replace('Councillor', '')
        name = re.sub(r'\(.+?\)', '', name)
        name = ' '.join(name.split())
        district_id = ' '.join(parts[1].split()[:2])
        # needed a wacky xpath to deal with ward 8
        photo = span.xpath('preceding::hr[1]/following::img[1]/@src')
        email = span.xpath('string(following::a[1]/text())')  # can be empty
        p = Person(primary_org='legislature', name=name,
                   district=district_id, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        if email:
            p.add_contact('email', email)
        p.image = urljoin(COUNCIL_PAGE, photo[0])
        yield p
def scrape(self):
    """Scrape the Cambridge mayor and councillors.

    Regional councillors share the city-wide "Cambridge" district and get
    sequential seat numbers. Yields Person objects.
    """
    regional_councillor_seat_number = 1
    yield self.mayor_info(MAYOR_PAGE)
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@id="news"]//p')
    for councillor in councillors:
        district = councillor.xpath('./b')[0].text_content()
        # Keep only the "Ward ..." / "Regional ..." tail of the heading.
        district = re.findall('(?:W|R).*', district)[0]
        role = 'Councillor'
        if 'Regional' in district:
            role = 'Regional Councillor'
            district = 'Cambridge (seat {})'.format(regional_councillor_seat_number)
            regional_councillor_seat_number += 1
        name = councillor.xpath('.//a')[0].text_content()
        url = councillor.xpath('.//a')[0].attrib['href']
        # Detail page: photo plus labelled "Address:/Tel:/Fax:" table cells.
        detail = self.lxmlize(url)
        image = detail.xpath('//img[contains(@src, "councilImages")]/@src')[0]
        address = detail.xpath('//*[contains(text(),"Address")]/ancestor::td'
                               )[-1].text_content().split(':')[-1].replace("\t", '')
        phone = detail.xpath('//*[contains(text(),"Tel")]/ancestor::td'
                             )[-1].text_content().split(':')[-1].replace("\t", '')
        phone = phone.replace('(', '').replace(') ', '-')
        # BUG FIX: `fax` was only assigned inside the xpath guard, so a
        # councillor without a fax raised NameError (or silently reused the
        # previous councillor's number). Reset it every iteration and only
        # add the contact when one was actually found.
        fax = None
        if detail.xpath('//*[contains(text(),"Fax")]'):
            fax = detail.xpath('//*[contains(text(),"Fax")]/ancestor::td'
                               )[-1].text_content().split(':')[-1].replace("\t", '')
            fax = fax.replace('(', '').replace(') ', '-')
        email = self.get_email(detail)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        if fax:
            p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email)
        p.image = image
        yield p
def scrape(self):
    """Scrape the Haldimand County mayor and councillors.

    Each member is a linked paragraph ("Mayor X" or "Ward - Councillor Y");
    the linked detail page carries the photo and mailing/phone/fax lines.
    """
    listing = self.lxmlize(COUNCIL_PAGE)
    entries = listing.xpath(
        '//div[@id="ctl00_ContentPlaceHolder1_ContentBlock1"]//a/parent::p'
    )
    for entry in entries:
        text = entry.text_content()
        if not text.strip():
            continue
        if 'Mayor' in text:
            name = text.replace('Mayor ', '')
            district = 'Haldimand County'
            role = 'Mayor'
        else:
            district, name = text.split(' - ')
            name = name.replace('Councillor', '').strip()
            district = district.strip()
            role = 'Councillor'

        url = entry.xpath('.//a')[0].attrib['href']
        detail = self.lxmlize(url)

        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = detail.xpath(
            '//div[@id="ctl00_ContentPlaceHolder1_ContentBlock1"]//tr[1]/td//img/@src'
        )[0]

        # Text siblings of the mailto link hold address and phone/fax lines.
        lines = detail.xpath('//a[contains(@href, "mailto:")]/parent::*/text()')
        for idx, line in enumerate(lines):
            if re.match(r'[0-9]+ [A-Z]', line):
                # Street line: the two following lines complete the address.
                full_address = line + ', ' + lines[idx + 1] + ', ' + lines[idx + 2]
                p.add_contact('address', full_address, 'legislature')
            if re.findall(r'[0-9]{3} [0-9]{3} [0-9]{4}', line):
                if 'Fax' in line:
                    p.add_contact('fax',
                                  line.replace('Fax: ', '').strip().replace(' ', '-'),
                                  'legislature')
                else:
                    p.add_contact('voice',
                                  line.replace('Telephone: ', '').strip().replace(' ', '-'),
                                  'legislature')

        p.add_contact('email', self.get_email(detail))
        yield p
def scrape(self):
    """Scrape the mayor and ward councillors.

    Wards 1 and 2 have multiple seats, so their districts get "(seat N)"
    suffixes. Yields Person objects.
    """
    seat_numbers = defaultdict(int)
    page = self.lxmlize(COUNCIL_PAGE)
    mayor_url = page.xpath('//li[@id="pageid1075"]/div/a/@href')[0]
    yield self.scrape_mayor(mayor_url)
    wards = page.xpath('//div[@id="content"]//h3')
    for ward in wards:
        ward_name = ward.text_content()
        councillor_links = ward.xpath('./following-sibling::p[1]/a')
        assert len(councillor_links), 'No councillors found for ward {}'.format(ward_name)
        for councillor_link in councillor_links:
            name = councillor_link.text
            if ward_name in ('Ward 1', 'Ward 2'):
                # Multi-member wards: number the seats in listing order.
                seat_numbers[ward_name] += 1
                district = '{} (seat {})'.format(ward_name, seat_numbers[ward_name])
            else:
                district = ward_name
            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            url = councillor_link.attrib['href']
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            cpage = self.lxmlize(url)
            image_url_rel = cpage.xpath(
                '//div[@id="content"]//img[contains(@alt, "Councillor")]/@src'
            )[0]
            p.image = urljoin(url, image_url_rel)
            # BUG FIX: this xpath previously ran against `page` (the council
            # index) rather than the councillor's detail page, so contacts
            # never came from the right document; the email lookup two lines
            # below already used `cpage` with the same selector.
            contacts = cpage.xpath(
                '//div[@id="content"]//div[@class="block"]/text()')
            for contact in contacts:
                if not re.search(r'[0-9]', contact):
                    continue
                # Lines look like "T(519) 555-0100" — a type-letter prefix
                # before the parenthesized area code; bare numbers are "T".
                if '(' not in contact:
                    contact_type = 'T'
                else:
                    contact_type, contact = contact.split('(')
                contact = contact.replace(') ', '-').strip()
                if 'T' in contact_type:
                    p.add_contact('voice', contact, 'legislature')
                if 'H' in contact_type:
                    p.add_contact('voice', contact, 'residence')
                if 'C' in contact_type:
                    p.add_contact('cell', contact, 'legislature')
                if 'F' in contact_type:
                    p.add_contact('fax', contact, 'legislature')
            email = self.get_email(
                cpage, '//div[@id="content"]//div[@class="block"]')
            p.add_contact('email', email)
            yield p
def scrape(self):
    """Scrape MLAs from the member index and their detail pages.

    Collects name, district, party, photo, email, both office addresses,
    and legislature/constituency phone and fax numbers.
    """
    index = self.lxmlize(COUNCIL_PAGE)
    member_urls = index.xpath(
        '//table[@cellpadding="4"]//td//a[text()!=""]/@href')
    for member_url in member_urls:
        detail = self.lxmlize(member_url)
        # Hon. is followed by Dr. in one case but the clean_name function
        # removes only one honorific title
        name = (
            detail.xpath('//h2[contains(text(), "MLA:")]')[0]
            .text_content()
            .replace('MLA:', '')
            .replace('Dr.', '')
            .replace(', Q.C.', '')
            .replace('Wm.', '')
            .strip()
        )
        district, party = cleanup_list(detail.xpath(
            '//h2/following-sibling::div[1]/div[2]/div[1]/div/text()'))

        p = Person(primary_org='legislature', name=name, district=district,
                   role='MLA', party=party)
        p.add_source(COUNCIL_PAGE)
        p.add_source(member_url)
        p.image = detail.xpath('//img[contains(@src, "Members")]/@src')[0]

        email = detail.xpath('//span[@class="convertToEmail"]//text()')[0].strip()
        if '#' in email:
            email = email.split('#')[0]
        if email:
            p.add_contact('email', email)

        # Legislature office address.
        office = ', '.join(cleanup_list(detail.xpath(
            '//h4[contains(text(), "Office:")]/ancestor::div/text()')))
        p.add_contact('address', re.sub(r'\s{2,}', ' ', office), 'legislature')

        # Constituency office address — drop the trailing ", Phone ..." tail.
        constituency = ', '.join(cleanup_list(detail.xpath(
            '//h4[contains(text(), "Constituency:")]/ancestor::div[1]//text()')))
        constituency = re.sub(r'\s{2,}', ' ', constituency).split(', Phone')[0]
        p.add_contact('address', constituency, 'constituency')

        phones = cleanup_list(detail.xpath(
            '//span[contains(text(), "Phone:")]/following-sibling::text()'))
        faxes = cleanup_list(detail.xpath(
            '//span[contains(text(), "Fax:")]/following-sibling::span[1]/text()'))
        # First number is the legislature office; a second, if present,
        # is the constituency office.
        p.add_contact('voice', phones[0], 'legislature')
        if len(phones) > 1:
            p.add_contact('voice', phones[1], 'constituency')
        p.add_contact('fax', faxes[0], 'legislature')
        if len(faxes) > 1:
            p.add_contact('fax', faxes[1], 'constituency')
        yield p
def scrape_mayor(self, url):
    """Build and return a Person for Moncton's mayor from the given page."""
    doc = self.lxmlize(url)
    # Name is the second comma-separated field of the meta description.
    mayor_name = doc.xpath('//meta[@name="description"]/@content')[0].split(',')[1]
    mayor = Person(primary_org='legislature', name=mayor_name,
                   district='Moncton', role='Mayor')
    mayor.add_source(url)
    mayor.image = doc.xpath('//div[@id="content"]/p[1]/img/@src')[0]

    # Second contact cell in the info table holds address, phone/fax, email.
    cell = doc.xpath('//table[@class="whiteroundedbox"]//tr[2]/td[1]')[1]
    street = ', '.join(cell.xpath('./p[1]/text()')[1:4])
    street = re.sub(r'\s{2,}', ' ', street).strip()
    voice = cell.xpath('.//p[2]/text()')[0].split(':')[1].strip()
    fax_number = cell.xpath('.//p[2]/text()')[1].split(':')[1].strip()
    email = self.get_email(cell)

    mayor.add_contact('address', street, 'legislature')
    # A bare 7-digit local number is missing its area code; prepend 506.
    if len(re.sub(r'\D', '', voice)) == 7:
        voice = '506-{}'.format(voice)
    mayor.add_contact('voice', voice, 'legislature')
    mayor.add_contact('fax', fax_number, 'legislature')
    mayor.add_contact('email', email)
    return mayor
def scrape(self):
    """Scrape Winnipeg council members.

    Names/photos come from the council page; district, email, phone and
    fax come from the city's open-data ward API (matched by person name).
    """
    # https://winnipeg.ca/council/wards/includes/wards.js
    # var COUNCIL_API = 'https://data.winnipeg.ca/resource/r4tk-7dip.json';
    api_url = 'https://data.winnipeg.ca/resource/r4tk-7dip.json'
    records = json.loads(requests.get(api_url).content)

    page = self.lxmlize(COUNCIL_PAGE, 'utf-8')
    boxes = page.xpath('//div[@class="box"]')
    assert len(boxes), 'No councillors found'
    for box in boxes:
        role = box.xpath('.//div[@class="insideboxtitle"]/text()')[0].strip()
        name = box.xpath('.//p[@class="insideboxtext"]/text()')[0]
        portrait = box.xpath('.//@src')[0]
        if 'Councillor' in name:
            role = 'Councillor'
            name = name.replace('Councillor ', '')

        # Match this member against the current-council API records.
        record = next(
            (r for r in records
             if r['person'] == name and r['current_council']),
            None)
        if record is None:
            raise Exception(name)
        district = record['name_english'].replace(' - ', '—')  # hyphen, m-dash

        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(api_url)
        if not portrait.endswith('nophoto.jpg'):
            p.image = portrait
        p.add_contact('email', parse_email(record['email_link']))
        p.add_contact('voice', record['phone'], 'legislature')
        p.add_contact('fax', record['fax'], 'legislature')
        yield p
def scrape(self):
    # Scrape BC municipal candidates from a CSV, yielding Person objects
    # and one Organization per municipal council seen in the data.

    # Divisions (by OCD ID) to skip entirely; currently none.
    exclude_divisions = {}
    # District names to skip (regional-district electoral areas, etc.).
    exclude_districts = {
        'Capital',
        'Capital F',
        'Capital G',
        'Capital H',
        'Central Coast B',
        'Central Okanagan East',
        'Central Okanagan West',
        'Comox Valley B',
        'Comox Valley C',
        'Islands Trust',
        'Kitimat-Stikine C',
        'Kootenay Boundary B',
        'Kootenay Boundary C',
        'Kootenay Boundary D',
        'Kootenay Boundary E',
        'Metro Vancouver A',
        'North Coast A',
        'North Coast C',
        'North Coast D',
        'North Coast E',
        'Okanagan-Similkameen I',
        'Okanagan-Similkameen Olalla Local Community Commission',
        'Qathet A',
        'Qathet B',
        'Qathet C',
        'Qathet D',
        'Qathet E',
    }
    # The only role the CSV is expected to contain; anything else aborts.
    expected_roles = {
        'candidate',
    }
    # Census-subdivision classification code -> word used in the council name.
    infixes = {
        'CY': 'City',
        'DM': 'District',
        'IGD': 'District',
        'IM': 'Municipal',
        'RGM': 'Regional',
        'T': 'Town',
        'VL': 'Village',
        'RDA': 'District',
    }
    # People who appear more than once; disambiguated with synthetic
    # birth dates below so they are not merged as duplicates.
    duplicate_names = {
        'Rick Smith',
        'Sung Y Wong',
        'Elizabeth Taylor',
    }
    # Map BC (province code 59) CSD names to OCD division IDs. A name seen
    # twice is set to None as a collision sentinel, caught later by the
    # `if not division_id` check.
    names_to_ids = {}
    for division in Division.get('ocd-division/country:ca').children('csd'):
        type_id = division.id.rsplit(':', 1)[1]
        if type_id.startswith('59'):
            if division.attrs['classification'] == 'IRI':
                continue  # skip Indian reserves
            if division.name in names_to_ids:
                names_to_ids[division.name] = None
            else:
                names_to_ids[division.name] = division.id

    reader = self.csv_reader(COUNCIL_PAGE, header=True)
    reader.fieldnames = [field.lower() for field in reader.fieldnames]

    organizations = {}  # division_id -> Organization, yielded at the end
    birth_date = 1900  # synthetic, incremented per duplicate-named person
    seen = set()  # division IDs whose Organization has been created
    for row in reader:
        name = row['full name']
        district_name = row['district name']
        # Skip blank rows, vacant seats, and excluded districts.
        if not any(row.values()) or name.lower() in ('', 'vacant') or district_name in exclude_districts:
            continue

        if row['district id']:
            division_id = 'ocd-division/country:ca/csd:{}'.format(row['district id'])
        else:
            # No explicit ID: resolve by name (may be the None sentinel).
            division_id = names_to_ids[row['district name']]
        if division_id in exclude_divisions:
            continue
        if not division_id:
            # Name mapped to None above: two CSDs share this name.
            raise Exception('unhandled collision: {}'.format(row['district name']))

        division = Division.get(division_id)
        division_name = division.name
        organization_name = '{} {} Council'.format(division_name, infixes[division.attrs['classification']])

        # Create each council Organization only once.
        if division_id not in seen:
            seen.add(division_id)
            organizations[division_id] = Organization(name=organization_name, classification='government')
            organizations[division_id].add_source(COUNCIL_PAGE)

        organization = organizations[division_id]

        role = row['primary role']
        if role not in expected_roles:
            raise Exception('unexpected role: {}'.format(role))

        # With an explicit district ID the post label is the full OCD ID;
        # otherwise the human-readable division name is used.
        if row['district id']:
            district = format(division_id)
        else:
            district = division_name

        organization.add_post(role=role, label=district, division_id=division_id)

        p = Person(primary_org='government', primary_org_name=organization_name, name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        if row['source url']:
            p.add_source(row['source url'])
        if name in duplicate_names:
            # Distinct synthetic birth dates keep same-named people separate.
            p.birth_date = str(birth_date)
            birth_date += 1
        if row['email']:
            p.add_contact('email', row['email'])
        if row['phone']:
            p.add_contact('voice', row['phone'], 'legislature')
        if row['twitter']:
            p.add_link(row['twitter'])
        # Attach the census-subdivision boundary URL to the related post.
        p._related[0].extras['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(division_id.rsplit(':', 1)[1])
        yield p

    for organization in organizations.values():
        yield organization