def scrape(self):
    """Yield Saguenay's mayor (from the mayor/contact pages) and the
    district councillors (from the council page)."""
    mayor_root = self.lxmlize(MAYOR_PAGE)
    contact_root = self.lxmlize(CONTACT_PAGE)

    # The mayor's name precedes ", maire" inside a span on the mayor page.
    mayor_name = mayor_root.xpath('//span/text()[contains(., "maire")]')[0].split(', ', 1)[0]
    mayor = Person(primary_org='legislature', name=mayor_name, district='Saguenay', role='Maire')
    mayor.add_source(MAYOR_PAGE)
    mayor.add_source(CONTACT_PAGE)

    # Contact details sit in the second paragraph after the cabinet heading.
    cabinet = contact_root.xpath('//h2[contains(., "Coordonnées du cabinet")]/following-sibling::p')[1]
    mayor.add_contact('voice', self.get_phone(cabinet, area_codes=[418]), 'legislature')
    mayor.add_contact('email', self.get_email(cabinet))
    yield mayor

    council_root = self.lxmlize(COUNCIL_PAGE)
    district_divs = council_root.xpath('//div[contains(./h3, "District")]')
    assert len(district_divs), 'No councillors found'
    for div in district_divs:
        district = div.xpath('./h3/text()')[0].replace('#', '')
        councillor = Person(
            primary_org='legislature',
            name=div.xpath('.//p/text()')[0],
            district=district,
            role='Conseiller',
        )
        councillor.add_source(COUNCIL_PAGE)
        councillor.add_contact('voice', self.get_phone(div), 'legislature')
        councillor.add_contact('email', self.get_email(div))
        yield councillor
def scrape(self):
    """Yield Terrebonne's councillors and mayor.

    Bug fix: the mayor's photo URL was previously read from the loop's
    leftover ``councillor_elem`` (i.e. the last councillor's photo)
    instead of from ``mayor_elem``.
    """
    page = self.lxmlize(COUNCIL_PAGE, 'utf-8')

    councillors = page.xpath('//div[@class="member-box member-box--gray"]')
    assert len(councillors), 'No councillors found'
    for councillor_elem in councillors:
        name = councillor_elem.xpath('.//div[@class="fiche__name"]/text()')[0]
        district = councillor_elem.xpath('.//div[@class="fiche__category"]/text()')[0]
        # The phone text reads like "T450-…"; keep what follows the "T".
        phone = councillor_elem.xpath('.//div[@class="fiche__social"]/span/text()')[0].split('T')[1]
        email_mailto = councillor_elem.xpath('.//div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href')
        photo_url = councillor_elem.xpath('.//img')[0].attrib['src']
        p = Person(primary_org='legislature', name=name, district=district, role='Conseiller', image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        if email_mailto:
            p.add_contact('email', email_mailto[0].split('mailto:')[1])
        yield p

    mayor_elem = page.xpath('//div[@class="member-box member-box--main"]')[0]
    name = mayor_elem.xpath('.//div[@class="fiche__name"]/text()')[0]
    phone = mayor_elem.xpath('.//div[@class="fiche__social"]/span/text()')[0].split('T')[1]
    email_mailto = mayor_elem.xpath('.//div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href')
    # FIXED: was councillor_elem.xpath(...) — the last councillor's photo.
    photo_url = mayor_elem.xpath('.//img')[0].attrib['src']
    p = Person(primary_org='legislature', name=name, district='Terrebonne', role='Maire', image=photo_url)
    p.add_source(COUNCIL_PAGE)
    p.add_contact('voice', phone, 'legislature')
    if email_mailto:
        p.add_contact('email', email_mailto[0].split('mailto:')[1])
    yield p
def scrape(self):
    """Yield Saguenay's mayor and district councillors from COUNCIL_PAGE."""
    page = self.lxmlize(COUNCIL_PAGE)

    mayor_lines = page.xpath('//div[./div/h3[contains(text(), "Maire")]]/p/text()')
    mayor_name = mayor_lines[0].strip().split('.')[1].strip()
    mayor_phone = mayor_lines[1].strip().split(':')[1].strip()
    mayor = Person(primary_org='legislature', name=mayor_name, district='Saguenay', role='Maire')
    mayor.add_source(COUNCIL_PAGE)
    mayor.add_contact('voice', mayor_phone, 'legislature')
    yield mayor

    for block in page.xpath('//div[./div/h3[contains(text(), "District")]]'):
        district = block.xpath('./div/h3')[0].text_content().replace('#', '')
        # The page text is mis-encoded; round-trip it to recover UTF-8.
        raw_name = block.xpath('.//p/text()')[0].encode('latin-1').decode('utf-8')
        name = raw_name.replace('M. ', '').replace('Mme ', '').strip()
        phone = block.xpath('.//p/text()')[1].split(':')[1].strip().replace(' ', '-')
        email = self.get_email(block)
        councillor = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
        councillor.add_source(COUNCIL_PAGE)
        councillor.add_contact('voice', phone, 'legislature')
        councillor.add_contact('email', email)
        yield councillor
def scrape(self):
    """Yield Halifax councillors (from the district index) and the mayor."""
    page = self.lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//div[@id = "block-districtdistrictindex"]/ul/li')[1:]
    assert len(rows), 'No councillors found'
    for row in rows:
        photo_div = row.xpath('./a/div[1]')[0]
        info_div = row.xpath('./a/div[2]')[0]
        # Join district-name fragments and normalize dash variants to em dash.
        district = re.sub(r'\s*[–—-]\s*', '—', '—'.join(info_div.xpath('./p/text()')))
        # FIXME: one malformed district name is special-cased below. If
        # you're editing this file, try removing these lines.
        if district.startswith("District 16 "):
            district = district[len("District 16 "):]
        name = info_div.xpath('./strong/p/text()')[0].replace('Councillor ', '').replace('Deputy Mayor ', '')
        if name == 'To be determined':
            continue
        photo = photo_div.xpath('.//img/@src')[0]
        url = row.xpath('./a/@href')[0]
        profile = self.lxmlize(url)
        contact_node = profile.xpath('//div[@id = "block-districtdistrictprofile"]')[0]
        phone = self.get_phone(contact_node, area_codes=[902])
        email = self.get_email(contact_node)
        p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email)
        p.image = photo
        yield p

    mayor_page = self.lxmlize(MAYOR_PAGE, 'iso-8859-1')
    mayor_name = ' '.join(mayor_page.xpath('//h1/text()')).replace('Mayor', '').strip()
    contact_div = mayor_page.xpath('//aside[contains(@class, "layout-sidebar-second")]/section/div[1]')[0]
    contact_p = contact_div.xpath('./p[2]')[0]
    phone = self.get_phone(contact_p)
    email = self.get_email(contact_p)
    mayor = Person(primary_org='legislature', name=mayor_name, district='Halifax', role='Mayor')
    mayor.add_source(MAYOR_PAGE)
    mayor.add_contact('email', email)
    mayor.add_contact('voice', phone, 'legislature')
    yield mayor
def scrape(self):
    """Yield Halifax councillors from their district pages, then the mayor."""
    page = self.lxmlize(COUNCIL_PAGE)
    for div in page.xpath('//div[./h2/a[contains(@href, "/District")]]'):
        # Join the non-empty district fragments with em dashes, then
        # normalize any surrounding dash variants to a bare em dash.
        fragments = (text.replace(',', '').strip() for text in div.xpath('./p/text()'))
        district = re.sub(r' ?[–—-] ?', '—', '—'.join(filter(None, fragments)))
        raw_name = div.xpath('./p/strong/text()')[0]
        name = raw_name.strip()[len('Councillor '):] if 'Councillor' in raw_name else raw_name
        if name == 'To be determined':
            continue
        photo = div.xpath('./p/a/img/@src')[0]
        district_page = self.lxmlize(div.xpath('./h2/a/@href')[0])
        contact_page_url = district_page.xpath('//li/a[contains(@href, "contact")]/@href')[0]
        contact_page = self.lxmlize(contact_page_url)
        contact_node = contact_page.xpath('//div[./h1[contains(text(), "Contact")]]')[0]
        phone = self.get_phone(contact_node, area_codes=[902])
        email = self.get_email(contact_node)
        p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(contact_page_url)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email)
        p.image = photo
        yield p

    mayor_page = self.lxmlize(MAYOR_PAGE, 'iso-8859-1')
    # The mayor's name is the "<name> Bio" heading minus its suffix.
    mayor_name = ' '.join(mayor_page.xpath('//h2[contains(., "Bio")]/text()')).strip()[:-len(' Bio')]
    mayor_contact = self.lxmlize(MAYOR_CONTACT_URL, 'iso-8859-1')
    mayor_email = self.get_email(mayor_contact)
    mayor = Person(primary_org='legislature', name=mayor_name, district='Halifax', role='Mayor')
    mayor.add_source(MAYOR_PAGE)
    mayor.add_source(MAYOR_CONTACT_URL)
    mayor.add_contact('email', mayor_email)
    yield mayor
def scrape(self):
    # Scrape Caledon's mayor and councillors from a table-based layout.
    page = self.lxmlize(COUNCIL_PAGE)
    # The mayor occupies the only cell that spans two table rows.
    node = page.xpath('//td[@rowspan="2"]')[0]
    name = node.xpath('.//h3/strong/text()')[0]
    image = node.xpath('.//@src')[0]
    voice = self.get_phone(node)
    url = node.xpath('.//a[contains(., "Visit")]/@href')[0]
    p = Person(primary_org='legislature', name=name, district='Caledon', role='Mayor')
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    p.add_contact('voice', voice, 'legislature')
    # The email address only appears on the member's own page.
    p.add_contact('email', self.get_email(self.lxmlize(url)))
    p.image = image
    yield p
    councillors = page.xpath('//div[@id="printAreaContent"]//table[2]//td')
    # Skip cells 12-15 — presumably a non-member row in the table;
    # TODO confirm against the live page.
    councillors = councillors[:12] + councillors[16:]
    assert len(councillors), 'No councillors found'
    for i in range(len(councillors) // 3):
        # Cells are laid out in bands of 12 (4 columns x 3 logical rows):
        # headings at offsets i..i+3, photos at i+4..i+7, details at
        # i+8..i+11. Remap the running index onto the heading cell.
        i = i // 4 * 12 + i % 4
        district, role = councillors[i].xpath('.//h3/text()')
        name = councillors[i + 8].xpath('.//strong/text()')[0]
        voice = self.get_phone(councillors[i + 8])
        url = councillors[i + 8].xpath('.//a[contains(., "Visit")]/@href')[0]
        # Some photo cells are placeholders with no image yet.
        if 'photo to come' in councillors[i + 4].text_content():
            image = None
        else:
            image = councillors[i + 4].xpath('.//@src')[0]
        # Replace non-breaking spaces; pluralize "Ward" for multi-ward seats.
        district = district.replace('\xa0', ' ')
        if ' and ' in district:
            district = district.replace('Ward ', 'Wards ')
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('voice', voice, 'legislature')
        p.add_contact('email', self.get_email(self.lxmlize(url)))
        if image:
            p.image = image
        yield p
def scrape(self):
    """Yield Charlottetown's mayor and ward councillors."""
    root = self.lxmlize(COUNCIL_PAGE)
    title_spans = root.xpath('//span[@class="Title"]')

    mayor_span = title_spans[0]
    mayor_text = ' '.join(mayor_span.xpath('.//text()'))
    mayor_name = re.search(r'[^(]+', mayor_text).group(0).strip()
    mayor_photo = urljoin(COUNCIL_PAGE, mayor_span.xpath('img/@src')[0])
    mayor_email = mayor_span.xpath('following::a[1]/text()')[0]
    mayor = Person(primary_org='legislature', name=mayor_name, district='Charlottetown', role='Mayor')
    mayor.add_source(COUNCIL_PAGE)
    mayor.add_contact('email', mayor_email)
    mayor.image = mayor_photo
    yield mayor

    councillors = title_spans[1:]
    assert len(councillors), 'No councillors found'
    for span in councillors:
        spantext = ' '.join(span.xpath('.//text()'))
        # Headers look like "<name> - Ward <n> ..."; normalize dashes first.
        header = spantext.replace('\u2013', '-').replace('\x96', '-').split('-')
        if len(header) != 2:
            continue
        name = header[0].strip().replace('Councillor', '')
        name = ' '.join(re.sub(r'\(.+?\)', '', name).split())
        district_id = ' '.join(header[1].split()[:2])
        # needed a wacky xpath to deal with ward 8
        photo = span.xpath('preceding::hr[1]/following::img[1]/@src')
        photo_url = urljoin(COUNCIL_PAGE, photo[0])
        email = span.xpath('string(following::a[1]/text())')  # can be empty
        p = Person(primary_org='legislature', name=name, district=district_id, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        if email:
            p.add_contact('email', email)
        p.image = photo_url
        yield p
def scrape(self):
    """Yield Beaconsfield's mayor and district councillors."""
    page = self.lxmlize(COUNCIL_PAGE)
    for heading in page.xpath('//h1[@class="title"]'):
        text = heading.text_content()
        if ',' not in text:
            continue
        name, district = text.split(',')
        name = name.strip()
        if 'Mayor' in district:
            mayor = Person(primary_org='legislature', name=name, district='Beaconsfield', role='Maire')
            mayor.add_source(COUNCIL_PAGE)
            mayor.image = heading.xpath('./parent::div/parent::div/p//img/@src')[0]
            phone = heading.xpath('.//parent::div/following-sibling::div[contains(text(), "514")]/text()')[0]
            mayor.add_contact('voice', phone.split(':')[1].strip().replace(' ', '-'), 'legislature')
            # The email is obfuscated inside an inline script.
            script = heading.xpath('.//parent::div/following-sibling::div/script')[0].text_content()
            mayor.add_contact('email', get_email(script))
            yield mayor
            continue
        district = district.split('-')[1].strip()
        councillor = Person(primary_org='legislature', name=name, district=district, role='Conseiller')
        councillor.add_source(COUNCIL_PAGE)
        councillor.image = heading.xpath('./parent::div/parent::div/p//img/@src')[0]
        phone_lines = heading.xpath('.//parent::div/following-sibling::p[contains(text(), "514")]/text()')
        if phone_lines:
            councillor.add_contact('voice', phone_lines[0].split(':')[1].strip().replace(' ', '-'), 'legislature')
        script = heading.xpath('.//parent::div/following-sibling::p/script')[0].text_content()
        councillor.add_contact('email', get_email(script))
        yield councillor
def scrape(self):
    """Yield Calgary's ward councillors (via councillor_data) and the mayor."""
    page = self.lxmlize(COUNCIL_PAGE)
    for node in page.xpath('//div[contains(@class,"cocis-has-caption")]')[1:]:
        url = urljoin(COUNCIL_PAGE, node.xpath('.//a[1]/@href')[0])
        name = node.xpath('.//a//text()')[0]
        # Drop the caption's trailing word (the ward number, presumably).
        ward = ' '.join(node.xpath('.//strong//text()')[0].split()[:-1])
        yield self.councillor_data(url, name, ward)

    mayor_node = page.xpath('//div[contains(@class, "cocis-image-panel")]')[0]
    photo_url = urljoin(COUNCIL_PAGE, mayor_node.xpath('.//img/@src')[0])
    mayor_name = mayor_node.xpath('.//a//text()')[0]
    mayor_page = self.lxmlize(MAYOR_PAGE)
    # Email behind mailhide
    # email = self.get_email(mayor_page)
    phone = self.get_phone(mayor_page, area_codes=[403])
    mayor = Person(primary_org='legislature', name=mayor_name, district='Calgary', role='Mayor')
    mayor.add_source(COUNCIL_PAGE)
    mayor.add_source(MAYOR_PAGE)
    mayor.add_contact('voice', phone, 'legislature')
    mayor.image = photo_url
    yield mayor
def scrape(self):
    """Yield Lambton's warden, deputy warden, and seat-numbered councillors."""
    seat = 1
    page = self.lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//div[@id="content"]//table//tr[position() mod 2 = 1]')
    assert len(rows), 'No councillors found'
    for row in rows:
        label = row.xpath('.//strong/text()')[0]
        # Check "Deputy Warden" first, since "Warden" is a substring of it.
        if 'Deputy Warden' in label:
            role = 'Deputy Warden'
            name = label.replace('Deputy Warden', '')
            district = 'Lambton'
        elif 'Warden' in label:
            role = 'Warden'
            name = label.replace('Warden', '')
            district = 'Lambton'
        else:
            role = 'Councillor'
            name = label
            district = 'Lambton (seat {})'.format(seat)
            seat += 1
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = row.xpath('.//img/@src')[0]
        p.add_contact('email', self.get_email(row))
        yield p
def scrape(self):
    """Yield Alberta MLAs from the assembly's CSV export plus detail pages."""
    reader = csv.DictReader(StringIO(self.get(self.get_csv_url()).text))
    for row in reader:
        full_name = '{} {} {}'.format(row['MLA First Name'], row['MLA Middle Names'], row['MLA Last Name'])
        if not full_name.strip():
            continue
        party = get_party(row['Caucus'])
        # Keep only the text before any comma (trailing status qualifier).
        name = full_name.split(',')[0]
        detail_url = ('http://www.assembly.ab.ca/net/index.aspx?'
                      'p=mla_contact&rnumber={0}&leg=29'.format(row['Riding Number']))
        detail_page = self.lxmlize(detail_url)
        photo_url = detail_page.xpath('//img[@class="MemPhoto"]/@src')[0]
        p = Person(
            primary_org='legislature',
            name=name,
            district=row['Riding Name'],
            role='MLA',
            party=party,
            image=photo_url,
        )
        p.add_source(COUNCIL_PAGE)
        p.add_source(detail_url)
        # Prefer the "Email" column; fall back to "MLA Email" if present.
        if row['Email']:
            p.add_contact('email', row['Email'])
        elif row.get('MLA Email'):
            p.add_contact('email', row['MLA Email'])
        if row['Phone Number']:
            p.add_contact('voice', row['Phone Number'], 'legislature')
        yield p
def scrape(self):
    """Yield Mercier's mayor (first table row) and district councillors."""
    page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT, encoding='windows-1252')
    rows = page.xpath('//table[@width="800"]/tr')
    assert len(rows), 'No councillors found'
    for index, row in enumerate(rows):
        name = row.xpath('.//strong/text()')[0].replace('Monsieur', '').replace('Madame', '').strip()
        if index == 0:
            role = 'Maire'
            district = 'Mercier'
        else:
            role = 'Conseiller'
            # The district number is a digit in the row's fourth text node.
            digit = re.search(r'(\d)', row.xpath('.//text()')[3]).group(1)
            district = 'District {}'.format(digit)
        email = self.get_email(row)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Yield Senneville's mayor and councillors from a table layout."""
    page = self.lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//div[@class="field-item even"]//tr')
    assert len(rows), 'No councillors found'
    for row in rows:
        district = row.xpath('./td[1]//strong/text()')[0].replace('no. ', '')
        if 'Maire' in district:
            district = 'Senneville'
            role = 'Maire'
        else:
            role = 'Conseiller'
        name = row.xpath('./td[2]//p//text()')[0].title()
        email = self.get_email(row)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        try:
            p.image = row.xpath('.//img/@src')[0]
        except IndexError:
            # Some rows have no photo; that's fine.
            pass
        p.add_contact('email', email)
        yield p
def scrape(self):
    """Yield Oshawa's council, normalizing role names and the district."""
    page = self.lxmlize(COUNCIL_PAGE)
    cells = page.xpath('//table//td[*]')
    assert len(cells), 'No councillors found'
    role_map = {
        'City Councillor': 'Councillor',
        'Regional & City Councillor': 'Regional Councillor',
    }
    for cell in cells:
        district, role, name = cell.xpath('./p[1]/text()')
        role = role.strip()
        if district == 'City of Oshawa':
            district = 'Oshawa'
        role = role_map.get(role, role)
        photo_url = cell.xpath('./p/img/@src')[0]
        phone = self.get_phone(cell.xpath('./p[contains(.//text(), "Phone")]')[0], area_codes=[905])
        p = Person(primary_org='legislature', name=name, district=district, role=role, image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', self.get_email(cell))
        yield p
def scrape(self):
    """Yield Whitby's mayor, then the members listed under "Councillors"
    (skipping vacant seats)."""
    regional_seat = 1
    page = self.lxmlize(COUNCIL_PAGE)
    yield self.scrape_mayor(page)
    nodes = page.xpath('//h3[contains(text(), "Councillors")]/following-sibling::p')[:-1]
    assert len(nodes), 'No councillors found'
    for node in nodes:
        text = node.xpath('./strong/text()')
        if not text or 'Vacant' in text:
            continue
        name, role_district = text
        name = name.rstrip(',')
        if 'Regional Councillor' in role_district:
            role = role_district
            district = 'Whitby (seat {})'.format(regional_seat)
            regional_seat += 1
        else:
            role, district = role_district.strip().split(', ')
            district = district.split(' (')[0]
        email = self.get_email(node)
        image = node.xpath('./img/@src')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role, image=image)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('email', email)
        yield p
def scrape(self):
    # Scrape Richmond Hill's mayor, regional councillors, and ward
    # councillors from individual member pages linked off COUNCIL_PAGE.
    regional_councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//center/center//a')
    for councillor in councillors:
        name = councillor.text_content().strip()
        url = councillor.attrib['href']
        page = self.lxmlize(url)
        header = page.xpath('//div[@class="sectionheading"]')[0].text_content()
        if header == 'Mayor of Richmond Hill':
            district = 'Richmond Hill'
            role = 'Mayor'
        else:
            # Ward councillors have ", <ward> -" in the heading; members
            # without one get a sequential at-large seat number.
            district = re.findall(r',(.*)-', header)
            if district:
                district = district[0].strip()
            else:
                district = 'Richmond Hill (seat {})'.format(regional_councillor_seat_number)
                regional_councillor_seat_number += 1
            role = 'Regional Councillor' if 'Regional' in header else 'Councillor'
        # Contact info lives in the last cell of whichever of the two
        # table variants the member page uses.
        info = page.xpath('//table[@cellpadding>0]/tbody/tr/td[last()]|//table[not(@cellpadding)]/tbody/tr/td[last()]')
        info = info[0].text_content().replace(' - office:', ':')
        address = re.findall(r'(?<=Town of Richmond Hill)(.*(?=Telephone:)|(?=Telephone))', info)[0]
        # Re-insert the spaces that text_content() collapsed between words.
        address = re.sub(r'([a-z])([A-Z])', r'\1 \2', address)
        # I expected to be able to do '(.*)(?=\sTelephone|Telephone|Fax)', but nope.
        phone = re.findall(r'(?<=Telephone:) ((.*) (?=Telephone)|(.*)(?=Telephone)|(.*)(?=Fax))', info)[0][0].replace('(', '').replace(') ', '-').replace(', ext. ', ' x')
        fax = re.findall(r'(?<=Fax:) (.*)(?=E-mail)', info)[0].replace(' ', '').replace('(', '').replace(')', '-')
        email = self.get_email(page)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email)
        # The portrait's alt text contains the member's name.
        p.image = page.xpath('//img[contains(@alt, "{}")]/@src'.format(name))[0]
        if 'Website' in info:
            p.add_link(re.findall(r'www\..*\.[a-z]+', info)[0])
        yield p
def scrape_councillor(self, url, district):
    """Build a Person for the Langley City councillor profiled at *url*."""
    profile = self.lxmlize(url)
    body = profile.xpath('//div[@class="item-page"]')[0]
    name = ' '.join(body.xpath('p[2]/text()')[0].split(' ')[1:3])
    parts = name.lower().split(' ')
    # Derive the address as first initial + last name @langleycity.ca.
    email = parts[0][0] + parts[1] + '@langleycity.ca'
    photo_url = body.xpath('p[1]/img/@src')[0]
    p = Person(primary_org='legislature', name=name, district=district, role='Councillor', image=photo_url)
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    p.add_contact('email', email)
    personal_infos = body.xpath('p[last()]/text()')
    if 'Residence' in personal_infos[0]:
        phone = re.findall(r'(Phone|Res)(:?) (.*)', '\n'.join(personal_infos))[0][2]
        address = re.findall(r'Address: (.*) (Phone|Res)', ' '.join(personal_infos))[0][0]
        p.add_contact('address', address, 'residence')
        p.add_contact('voice', phone, 'residence')
    return p
def scrape(self):
    """Yield Calgary's council; the first entry with no district span is
    treated as the mayor."""
    page = self.lxmlize(COUNCIL_PAGE)
    wrappers = page.xpath('//div[contains(@class, "councillorwrapper")]')
    assert len(wrappers), 'No councillors found'
    for index, wrapper in enumerate(wrappers):
        name = wrapper.xpath('.//h4/text()')[0]
        district = wrapper.xpath('.//h4/span/text()')[0].strip()
        role = 'Councillor'
        email = None
        if not district and index == 0:
            district = 'Calgary'
            role = 'Mayor'
            email = '*****@*****.**'
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.image = wrapper.xpath('.//@src')[0]
        if email:
            p.add_contact('email', email)
        p.add_source(COUNCIL_PAGE)
        yield p
def scrape(self):
    """Yield Terrebonne's council members, resolving each member's
    district from their individual profile page."""
    listing = self.lxmlize(COUNCIL_PAGE, 'utf-8')
    boxes = listing.xpath('//div[contains(@class, "member-box member-box--")]')
    assert len(boxes), 'No councillors found'
    for box in boxes:
        name = box.xpath('.//div[@class="fiche__name"]/text()')[0]
        # The phone text reads like "T450-…"; keep what follows the "T".
        phone = box.xpath('.//div[@class="fiche__social"]/span/text()')[0].split('T')[1]
        email_mailto = box.xpath('.//div[@class="fiche__social"]/a[contains(@href, "mailto")]/@href')
        photo_url = box.xpath('.//img')[0].attrib['src']
        profile = self.lxmlize(box.xpath('.//a[@class="member-box__calltoaction"]/@href')[0])
        district = profile.xpath('.//div[@class="fiche__category"]/text()')[0]
        if district == 'Maire':
            district = 'Terrebonne'
            role = 'Maire'
        else:
            district = 'District {}'.format(district)
            role = 'Conseiller'
        p = Person(primary_org='legislature', name=name, district=district, role=role, image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        if email_mailto:
            p.add_contact('email', email_mailto[0].split('mailto:')[1])
        yield p
def scrape(self):
    """Yield Winnipeg's ward councillors (via councillor_data) and the mayor."""
    page = self.lxmlize(COUNCIL_PAGE, 'utf-8')
    cells = page.xpath('//td[@width="105"]')
    assert len(cells), 'No councillors found'
    for cell in cells:
        url = urljoin(COUNCIL_PAGE, cell.xpath('.//a/@href')[0])
        ward = re.search('([A-Z].+) Ward', cell.xpath('.//a//text()')[0]).group(1)
        ward = ward.replace(' – ', '—').replace(' - ', '—')  # n-dash, m-dash, hyphen, m-dash
        ward = ward.replace('St. Norbert', 'St Norbert')  # to match ocd-division-ids
        name = ' '.join(cell.xpath('.//span[@class="k80B"][1]/text()'))
        yield self.councillor_data(url, name, ward)

    mayor_cell = page.xpath('//td[@width="315"]')[0]
    mayor_name = mayor_cell.xpath('./a//text()')[0][len('Mayor '):]
    mayor_photo = mayor_cell.xpath('./img/@src')[0]
    mayor = Person(primary_org='legislature', name=mayor_name, district='Winnipeg', role='Mayor')
    mayor.add_source(COUNCIL_PAGE)
    # @see http://www.winnipeg.ca/interhom/mayor/MayorForm.asp?Recipient=CLK-MayorWebMail
    mayor.add_contact('email', '*****@*****.**')  # hardcoded
    mayor.image = mayor_photo
    yield mayor
def scrape_mayor(self, url):
    """Build a Person for Langley City's mayor from the profile at *url*."""
    profile = self.lxmlize(url)
    body = profile.xpath('//div[@class="item-page"]')[0]
    name = ' '.join(body.xpath('p[2]/text()')[0].split(' ')[2:4])
    parts = name.lower().split(' ')
    # Derive the address as first initial + last name @langleycity.ca.
    email = parts[0][0] + parts[1] + '@langleycity.ca'
    photo_url = body.xpath('p[1]/img/@src')[0]
    p = Person(primary_org='legislature', name=name, district='Langley', role='Mayor', image=photo_url)
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    p.add_contact('email', email)
    personal_infos = body.xpath('p[last()]/text()')
    phone = re.findall(r'Phone(:?) (.*)', '\n'.join(personal_infos))[0][1]
    address = re.findall(r'Address: (.*) Phone', ' '.join(personal_infos))[0]
    p.add_contact('address', address, 'office')
    p.add_contact('voice', phone, 'office')
    return p
def scrape(self):
    """Yield Milton's council; only the first row carries address and
    phone details."""
    page = self.lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//table[@id="Table1table"]/tbody/tr')
    assert len(rows), 'No councillors found'
    role_map = {
        'Mayor and Regional Councillor': 'Mayor',
        'Local & Regional Councillor': 'Regional Councillor',
        'Local Councillor': 'Councillor',
    }
    for index, row in enumerate(rows):
        lines = row.xpath('./td[2]/p/text()')
        name = lines[1]
        role = role_map.get(lines[0].strip(), lines[0].strip())
        district = 'Milton' if len(lines) < 3 else lines[2]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = row.xpath('./td[1]/p//img/@src')[0]
        if index == 0:
            address = ', '.join(row.xpath('./td[3]/p[1]/text()')).replace('Email:', '').strip()
            p.add_contact('address', address, 'legislature')
            # Each line looks like "<type>: <number>".
            for line in row.xpath('./td[3]/p[2]/text()'):
                num_type, number = line.split(':')
                number = number.replace(', ext ', ' x').strip()
                p.add_contact(num_type, number, num_type)
        yield p
def scrape(self):
    """Yield Richmond's mayor and councillors; all members share the
    single email found on the contact page."""
    seat = 1
    contact_page = self.lxmlize(CONTACT_URL)
    shared_email = self.get_email(contact_page)
    listing = self.lxmlize(COUNCIL_PAGE)
    member_urls = listing.xpath('//a/@href[contains(., "members/")]')
    assert len(member_urls), 'No councillors found'
    for url in member_urls:
        member_page = self.lxmlize(url)
        role, name = member_page.xpath('//h1//text()')[0].split(' ', 1)
        photo_url = member_page.xpath('//div[@id="content"]//img/@src')[0]
        if role == 'Mayor':
            district = 'Richmond'
        else:
            district = 'Richmond (seat {})'.format(seat)
            seat += 1
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.image = photo_url
        p.add_source(COUNCIL_PAGE)
        p.add_source(CONTACT_URL)
        p.add_source(url)
        p.add_contact('email', shared_email)  # same for all
        yield p
def scrape_mayor(self):
    """Build a Person for Summerside's mayor from MAYOR_PAGE."""
    page = self.lxmlize(MAYOR_PAGE, 'iso-8859-1')
    name = page.xpath('//div[@class="articletitle"]/h1')[0].text_content().replace('Mayor', '')
    p = Person(primary_org='legislature', name=name, district='Summerside', role='Mayor')
    p.add_source(MAYOR_PAGE)
    # Strip the relative '..' prefix from the photo path.
    p.image = page.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0].replace('..', '')
    paragraphs = page.xpath('//div[@class="articlebody-inside"]/p')
    phone = re.findall(r'to (.*)', paragraphs[1].text_content())[0]
    address = paragraphs[3].text_content().replace('by mail: ', '') + ' ' + paragraphs[4].text_content()
    email = self.get_email(paragraphs[5])
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('address', address, 'legislature')
    p.add_contact('email', email)
    return p
def scrape_mayor(self, div):
    """Build a Person for Saint-Jean-sur-Richelieu's mayor from *div*."""
    name = div.xpath('.//a')[0].text_content()
    url = div.xpath('.//a/@href')[0]
    profile = self.lxmlize(url)
    contact_url = profile.xpath('//a[@title="Joindre le maire"]/@href')[0]
    contact_page = self.lxmlize(contact_url)
    p = Person(primary_org='legislature', name=name, district='Saint-Jean-sur-Richelieu', role='Maire')
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    p.add_source(contact_url)
    p.image = div.xpath('./preceding-sibling::td//img/@src')[-1]
    contacts = contact_page.xpath(
        '//div[@id="ctl00_PlaceHolderMain_ctl01_ctl01__ControlWrapper_RichHtmlField"]//div/font/text()'
    )
    # First four lines are the address; phone and fax come near the end.
    address = ' '.join(contacts[:4])
    phone = contacts[-3].split(':')[1].strip().replace(' ', '-')
    fax = contacts[-2].split(':')[1].strip().replace(' ', '-')
    p.add_contact('address', address, 'legislature')
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('fax', fax, 'legislature')
    # The mayor's email is only reachable through a web form.
    return p
def scrape(self):
    """Yield the mayor and, per ward, the first two councillors.

    Fix: the ward-header pattern was a non-raw string containing regex
    escapes (invalid escape sequences that warn on modern Python); it is
    now a raw string with identical matching behavior.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    mayor_info = page.xpath('//h2[contains(text(), "MAYOR")]//following-sibling::p')[0]
    yield self.scrape_mayor(mayor_info)
    wards = page.xpath('//h3')
    for ward in wards:
        # Strip the leading "WARD n - " prefix to get the district name.
        district = re.sub(r'\AWARD \d+ - ', '', ward.text_content())
        councillors = ward.xpath('following-sibling::p')
        for councillor in councillors:
            name = councillor.xpath('./strong')[0].text_content()
            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            info = councillor.xpath('./text()')
            address = info.pop(0)
            p.add_contact('address', address, 'legislature')
            # Remaining lines hold phone numbers separated by "»" and
            # non-breaking-space markers; split them out for parsing.
            for line in info:
                stuff = re.split(r'(\xbb)|(\xa0)', line)
                tmp = [y for y in stuff if y and not re.match(r'\xa0', y)]
                self.get_tel_numbers(tmp, p)
            email = self.get_email(councillor)
            p.add_contact('email', email)
            yield p
            # Only the first two paragraphs after each ward heading are
            # councillors; stop after emitting the second one.
            if councillor == councillors[1]:
                break
def scrape(self):
    """Yield Kirkland's mayor (first non-empty cell) and councillors."""
    page = self.lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    cells = page.xpath('//div[@id="PageContent"]/table/tbody/tr/td')
    assert len(cells), 'No councillors found'
    for index, cell in enumerate(cells):
        if not cell.text_content().strip():
            continue
        if index == 0:
            district = 'Kirkland'
            role = 'Maire'
        else:
            heading = cell.xpath('.//h2')[0].text_content()
            district = re.search('- (.+)', heading).group(1).strip()
            district = district.replace(' Ouest', ' ouest').replace(' Est', ' est')
            role = 'Conseiller'
        name = cell.xpath('.//strong/text()')[0]
        phone = cell.xpath('.//div[contains(text(), "#")]/text()')[0].replace('T ', '').replace(' ', '-').replace(',-#-', ' x')
        email = self.get_email(cell)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email)
        p.image = cell.xpath('.//img/@src')[0]
        yield p
def scrape(self):
    """Yield St. John's council; at-large councillors get sequential seat
    numbers, vacant entries are skipped."""
    seat = 1
    page = self.lxmlize(COUNCIL_PAGE)
    for node in page.xpath('//div[@class="view-content"]/div'):
        fields = node.xpath('./div')
        role = fields[0].xpath('./div//text()')[0]
        name = fields[2].xpath('.//a//text()')[0].title().split(role)[-1].strip()
        if name == 'Vacant':
            continue
        if 'Ward' in role:
            # The role text is actually the ward; demote the role.
            district = role
            role = 'Councillor'
        elif 'At Large' in role:
            role = 'Councillor at Large'
            district = "St. John's (seat {})".format(seat)
            seat += 1
        else:
            district = "St. John's"
        phone = fields[3].xpath('./div//text()')[0]
        email = self.get_email(fields[5])
        photo_url = node.xpath('.//img/@src')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email)
        p.image = photo_url
        yield p
def scrape(self):
    """Yield Sainte-Anne-de-Bellevue's mayor and district councillors."""
    page = self.lxmlize(COUNCIL_PAGE)
    blocks = page.xpath('//div[@class="block text"]')
    assert len(blocks), 'No councillors found'
    for block in blocks:
        name = block.xpath('.//div[@class="content-writable"]//strong/text()')[0]
        heading = block.xpath('.//h2/text()')[0]
        if 'Maire' in heading:
            district = 'Sainte-Anne-de-Bellevue'
            role = 'Maire'
        else:
            district = 'District {}'.format(re.search(r'\d+', heading)[0])
            role = 'Conseiller'
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = block.xpath('.//@src')[0]
        p.add_contact('email', self.get_email(block))
        yield p
def scrape(self):
    """Yield MLAs from the first member table, skipping vacant seats."""
    member_page = self.lxmlize(COUNCIL_PAGE, encoding='utf-8')
    rows = member_page.xpath('//table')[0].xpath('.//tr')[1:]
    assert len(rows), 'No members found'
    for row in rows:
        namecell, constitcell, partycell = row.xpath('.//td')
        full_name = namecell.text_content().strip()
        if full_name.lower() == 'vacant':
            continue
        # Names read "Last, First [Hon.]"; rebuild as "First Last".
        last, first = full_name.split(',')
        name = first.replace('Hon.', '').strip() + ' ' + last.title().strip()
        district = ' '.join(constitcell.text_content().split())
        party = get_party(partycell.text)
        url = namecell.xpath('.//a')[0].get('href')
        detail = self.lxmlize(url)
        email = self.get_email(detail)
        p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('email', email)
        image = detail.xpath('//img[@class="page_graphic"]/@src')
        if image:
            p.image = image[0]
        yield p