def scrape(self):
    """Scrape sitting MLAs from the legislature's member table.

    Yields a Person per non-vacant seat, with party, district, image and
    contact details pulled from each member's profile page.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//table[@id="MLAs"]//tr')[1:]  # drop header row
    # FIX: fail loudly if the table layout changed and nothing matched.
    assert len(councillors), 'No members found'
    for councillor in councillors:
        if 'Vacant' not in councillor.xpath('./td')[0].text_content():
            # First cell reads "Hon. Name" / "Mr. Name"; keep text after the honourific.
            name = councillor.xpath('./td')[0].text_content().split('. ', 1)[1]
            party = councillor.xpath('./td')[1].text
            district = councillor.xpath('./td')[2].text_content()
            url = councillor.xpath('./td[1]/a/@href')[0]
            page = self.lxmlize(url)
            p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.image = page.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0]
            contact = page.xpath('//div[@id="mla-contact"]/div[2]')[0]
            website = contact.xpath('./div[3]/div[3]/div[2]/a')
            if website:
                p.add_link(website[0].text_content())
            p.add_contact('address', ' '.join(contact.xpath('.//div[@class="col-md-4"][2]/div//text()')[1:9]), 'constituency')
            # FIX: both phone numbers are optional on profile pages; the old
            # unconditional [0] index raised IndexError when one was missing.
            phone_leg = contact.xpath('.//span[@id="MainContent_ContentBottom_Property6"]//text()')
            if phone_leg:
                p.add_contact('voice', phone_leg[0], 'legislature', area_code=306)
            phone_const = contact.xpath('.//div[@class="col-md-4"]/div[4]/span/span/text()')
            if phone_const:
                p.add_contact('voice', phone_const[0], 'constituency', area_code=306)
            # FIX: tolerate a missing email instead of raising.
            email = self.get_email(contact, error=False)
            if email:
                p.add_contact('email', email)
            yield p
def scrape(self):
    """Scrape hamlet councils and their members.

    Yields an Organization per municipal council, then a Person (with a
    membership carrying contact details) per councillor.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@class="entry-content"]//p/strong')
    for councillor in councillors:
        # FIX: `'–'.decode('utf-8')` is Python-2-only (str has no .decode in
        # Python 3); the unicode literals can be used directly.
        district = councillor.xpath('./ancestor::p/preceding-sibling::h2')[-1].text_content().split('–')[0]
        # Last two whitespace-separated tokens are the person's name.
        name = ' '.join(councillor.text_content().split()[-2:]).replace('-Â', '')
        role = councillor.text_content().replace(name, '').split('-')[0]
        if 'SAO' in role or not role:  # skip administrative officers / blanks
            continue
        org = Organization(name=district + ' Municipal Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(COUNCIL_PAGE)
        yield org
        p = Person(primary_org='legislature', name=name, district=district)
        p.add_source(COUNCIL_PAGE)
        membership = p.add_membership(org, role=role, district=district)
        info = councillor.xpath('./ancestor::p/text()')
        for contact in info:
            # 'NT' marks the postal-address line (Northwest Territories).
            if 'NT' in contact:
                membership.add_contact_detail('address', contact.strip(), 'legislature')
            if 'Tel' in contact:
                contact = contact.replace('Tel. ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('voice', contact, 'legislature')
            if 'Fax' in contact:
                contact = contact.replace('Fax ', '').replace('(', '').replace(') ', '-').strip()
                membership.add_contact_detail('fax', contact, 'legislature')
        email = self.get_email(councillor, './parent::p')
        membership.add_contact_detail('email', email)
        if 'Website' in councillor.xpath('./parent::p')[0].text_content():
            p.add_link(councillor.xpath('./parent::p//a')[1].attrib['href'])
        yield p
def scrape(self):
    """Scrape Richmond Hill's mayor, regional councillors and ward councillors."""
    # Regional councillors have no ward; assign synthetic "(seat N)" districts.
    regional_councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//center/center//a')
    for councillor in councillors:
        name = councillor.text_content().strip()
        url = councillor.attrib['href']
        page = self.lxmlize(url)
        header = page.xpath(
            '//div[@class="sectionheading"]')[0].text_content()
        if header == 'Mayor of Richmond Hill':
            district = 'Richmond Hill'
            role = 'Mayor'
        else:
            # Ward name sits between a comma and a dash in the heading.
            district = re.findall(r',(.*)-', header)
            if district:
                district = district[0].strip()
            else:
                district = 'Richmond Hill (seat {})'.format(
                    regional_councillor_seat_number)
                regional_councillor_seat_number += 1
            role = 'Regional Councillor' if 'Regional' in header else 'Councillor'
        info = page.xpath(
            '//table[@cellpadding>0]/tbody/tr/td[last()]|//table[not(@cellpadding)]/tbody/tr/td[last()]'
        )
        info = info[0].text_content().replace(' - office:', ':')
        address = re.findall(
            r'(?<=Town of Richmond Hill)(.*(?=Telephone:)|(?=Telephone))',
            info)[0]
        # Re-insert spaces lost when text_content() flattened the markup.
        address = re.sub(r'([a-z])([A-Z])', r'\1 \2', address)
        # I expected to be able to do '(.*)(?=\sTelephone|Telephone|Fax)', but nope.
        phone = re.findall(
            r'(?<=Telephone:) ((.*) (?=Telephone)|(.*)(?=Telephone)|(.*)(?=Fax))',
            info)[0][0].replace('(', '').replace(') ', '-').replace(', ext. ', ' x')
        fax = re.findall(r'(?<=Fax:) (.*)(?=E-mail)', info)[0].replace(
            ' ', '').replace('(', '').replace(')', '-')
        email = self.get_email(page)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email)
        p.image = page.xpath(
            '//img[contains(@alt, "{}")]/@src'.format(name))[0]
        if 'Website' in info:
            p.add_link(re.findall(r'www\..*\.[a-z]+', info)[0])
        yield p
def scrape(self):
    """Scrape the mayor and ward councillors; a ward may hold several seats."""
    seat_numbers = defaultdict(int)
    page = self.lxmlize(COUNCIL_PAGE)
    yield self.scrape_mayor()
    councillors = page.xpath('//div[@id="centre_content"]//tr')
    # FIX: fail loudly if the page layout changed and no rows matched
    # (consistent with this file's other scrapers).
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        if 'Position' in councillor.text_content():  # skip the header row
            continue
        ward = councillor.xpath('./td')[0].text_content().replace('Councillor', '')
        seat_numbers[ward] += 1
        district = '{} (seat {})'.format(ward, seat_numbers[ward])
        name = councillor.xpath('./td')[1].text_content()
        url = councillor.xpath('./td/a')[0].attrib['href']
        p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        page = self.lxmlize(url)
        content = page.xpath('//div[@id="centre_content"]')[0]
        email = self.get_email(content)
        p.add_contact('email', email)
        p.add_contact('voice', self.get_phone(content, area_codes=[226, 519]), 'legislature')
        p.image = page.xpath('string(//div[@id="centre_content"]//img/@src)')  # can be empty
        if len(page.xpath('//div[@id="centre_content"]//a')) > 2:
            p.add_link(page.xpath('//div[@id="centre_content"]//a')[-1].attrib['href'])
        yield p
def scrape(self):
    """Scrape Newmarket's mayor, regional councillor and ward councillors."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@id="printArea"]//table//tr//td')[4:-1]
    # First cell is the mayor; the rest are councillors.
    yield self.scrape_mayor(councillors[0])
    for councillor in councillors[1:]:
        name = ' '.join(councillor.xpath('.//strong/a[last()]//text()')[0].split())
        infostr = councillor.xpath('.//strong//text()')[0]
        try:
            # "Name - Ward N" → ward councillor; no dash → regional councillor.
            district = infostr.split('-')[1]
            role = 'Councillor'
        except IndexError:
            district = 'Newmarket'
            role = 'Regional Councillor'
        url = councillor.xpath('.//a/@href')[0]
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = councillor.xpath('.//img/@src')[0]
        page = self.lxmlize(url)
        info = page.xpath('//div[@id="printArea"]')[0]
        info = info.xpath('.//p[@class="heading"][2]/following-sibling::p')
        address = info.pop(0).text_content().strip()
        if not address:
            address = info.pop(0).text_content().strip()
        if 'Ward' in info[0].text_content():
            info.pop(0)
        # Paragraph like "Phone: 905-... Ext. 123 Fax: 905-..."; splitting on ':'
        # leaves each number preceded by its label in the previous fragment.
        numbers = info.pop(0).text_content().split(':')
        email = self.get_email(page)
        p.add_contact('email', email)
        for i, contact in enumerate(numbers):
            if i == 0:
                continue
            if '@' in contact:
                continue  # executive assistant email
            else:
                number = re.findall(r'([0-9]{3}-[0-9]{3}-[0-9]{4})', contact)[0]
                ext = re.findall(r'(Ext\. [0-9]{3,4})', contact)
                if ext:
                    number = number + ext[0].replace('Ext. ', ' x')
                # Label is the last word before this number (e.g. "Phone", "Fax").
                contact_type = re.findall(r'[A-Za-z]+$', numbers[i - 1])[0]
                if 'Fax' in contact_type:
                    p.add_contact('fax', number, 'legislature')
                elif 'Phone' in contact_type:
                    p.add_contact('voice', number, 'legislature')
                else:
                    p.add_contact(contact_type, number, contact_type)
        site = page.xpath('.//a[contains(text(), "http://")]')
        if site:
            p.add_link(site[0].text_content())
        yield p
def scrape(self):
    """Scrape sitting MLAs from the legislature's member table.

    Yields a Person per non-vacant seat, with party, district, image and
    optional legislature/constituency contact details from each profile page.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    members = page.xpath('//table[@id="MLAs"]//tr')[1:]  # drop header row
    assert len(members), 'No members found'
    for member in members:
        if 'Vacant' not in member.xpath('./td')[0].text_content():
            # First cell reads "Hon. Name" / "Mr. Name"; keep text after the honourific.
            name = member.xpath('./td')[0].text_content().split('. ', 1)[1]
            party = member.xpath('./td')[1].text
            district = member.xpath('./td')[2].text_content()
            url = member.xpath('./td[1]/a/@href')[0]
            page = self.lxmlize(url)
            p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            p.image = page.xpath(
                '//div[contains(@class, "mla-image-cell")]/img/@src')[0]
            contact = page.xpath('//div[@id="mla-contact"]/div[2]')[0]
            website = contact.xpath('./div[3]/div[3]/div[2]/a')
            if website:
                p.add_link(website[0].text_content())
            p.add_contact(
                'address',
                ' '.join(
                    contact.xpath(
                        './/div[@class="col-md-4"][2]/div//text()')[1:9]),
                'constituency')
            # Both phone numbers are optional on the profile page.
            phone_leg = contact.xpath(
                './/span[@id="MainContent_ContentBottom_Property6"]//text()'
            )
            if phone_leg:
                p.add_contact('voice', phone_leg[0], 'legislature', area_code=306)
            phone_const = contact.xpath(
                './/div[@class="col-md-4"]/div[4]/span/span/text()')
            if phone_const:
                p.add_contact('voice', phone_const[0], 'constituency', area_code=306)
            # error=False: tolerate a missing email rather than raising.
            email = self.get_email(contact, error=False)
            if email:
                p.add_contact('email', email)
            yield p
def scrape(self):
    """Scrape Markham's mayor, regional councillors and ward councillors."""
    # Regional councillors are elected at large; number their seats synthetically.
    regional_councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    mayor_url = page.xpath(
        '//a[contains(text(), "Office of the Mayor")]/@href')[0]
    yield self.scrape_mayor(mayor_url)
    councillors = page.xpath(
        '//div[@class="interiorContentWrapper"]//td[./a]')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        name_elem = ' '.join(councillor.xpath('.//strong/text()'))
        # Strip the title preceding the name, if any.
        if 'Mayor' in name_elem:
            name = name_elem.split('Mayor')[1]
        elif 'Councillor' in name_elem:
            name = name_elem.split('Councillor')[1]
        else:
            name = name_elem
        district = councillor.xpath('.//a//text()[normalize-space()]')[0]
        if 'Ward' in district:
            district = district.replace('Councillor', '')
            role = 'Councillor'
        elif 'Regional' in district:
            role = 'Regional Councillor'
            district = 'Markham (seat {})'.format(
                regional_councillor_seat_number)
            regional_councillor_seat_number += 1
        else:
            # e.g. "Deputy Mayor": the label itself is the role, city-wide seat.
            role = district
            district = 'Markham'
        image = councillor.xpath('.//img/@src')[0]
        url = councillor.xpath('.//a/@href')[0]
        address, phone, email, links = self.get_contact(url)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = image
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email)
        for link in links:
            p.add_link(link)
        yield p
def scrape(self):
    """Scrape Ajax's mayor and councillors from the site navigation menu."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath(
        '//ul[@class="subNav top"]/li/ul//li/a[contains(text(), "Councillor")] | //ul[@class="subNav top"]/li/ul//li/a[contains(text(), "Mayor")]'
    )
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        name = councillor.text_content()
        url = councillor.attrib['href']
        page = self.lxmlize(url)
        # The first nav entry is the mayor; the rest are (regional) councillors.
        if councillor == councillors[0]:
            district = 'Ajax'
            role = 'Mayor'
        else:
            district = re.findall(
                r'Ward.*', page.xpath('//div[@id="printAreaContent"]//h1')
                [0].text_content())[0].strip()
            role = page.xpath(
                '//div[@id="printAreaContent"]//h1')[0].text_content()
            role = re.search('((?:Regional )?Councillor)', role).group(1)
            # The nav label includes the role; remove it to leave the name.
            name = name.replace(role, '').strip()
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = page.xpath(
            '//div[@class="intQuicklinksPhoto"]//img/@src')[0]
        contact_info = page.xpath('//table[@class="datatable"][1]//tr')[1:]
        for line in contact_info:
            contact_type = line.xpath('./td')[0].text_content().strip()
            if re.match(r'(Home)|(Cell)|(Phone)|(Fax)|(Email)', contact_type):
                contact = line.xpath('./td')[1].text_content().strip()
                # Map the site's label to an OCD contact-detail type.
                contact_type = CONTACT_DETAIL_TYPE_MAP[contact_type]
                p.add_contact(
                    contact_type, contact,
                    '' if contact_type == 'email' else 'legislature')
            elif contact_type == 'Address':
                contact = ''.join(line.xpath('./td[2]//text()')).strip()
                p.add_contact(contact_type, contact, 'legislature')
            else:
                # Remaining rows (e.g. Website) are links, not contact details.
                contact = line.xpath('./td[2]/a/@href')[0]
                p.add_link(contact)
        yield p
def scrape(self):
    """Scrape ward councillors; the mayor is handled by scrape_mayor()."""
    yield self.scrape_mayor()
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath(
        '//div[contains(@class, "documentexcerpt-module__item")]')
    assert len(councillors), 'No councillors found'
    for cell in councillors:
        name = cell[1].text
        if name == 'Vacant':
            continue
        page_url = cell[0].attrib['href']
        page = self.lxmlize(page_url)
        # Profile page title reads "Ward N - Name".
        district_name = page.xpath(
            '//h1[contains(@class, "page-title")]')[0].text_content()
        district, name = district_name.split(' - ', 1)
        p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(page_url)
        image = page.xpath('//div[contains(@class, "content")]//img/@src')
        if image:
            p.image = image[0]
        address = page.xpath('//address//p')
        if address:
            address = address[0].text_content()
            p.add_contact('address', address, 'legislature')
        contacts = page.xpath(
            '//table[@summary="Contact information"]//tr')
        for contact in contacts:
            contact_type = contact.xpath('./th/text()')[0]
            value = contact.xpath('./td//text()')[0]
            if 'Title' in contact_type:
                continue
            elif 'Website' in contact_type or 'Facebook' in contact_type or 'Twitter' in contact_type:
                # Social/website rows become links rather than contact details.
                value = contact.xpath('./td/a/text()')[0]
                p.add_link(value)
            elif 'Telephone' in contact_type:
                p.add_contact('voice', value, 'legislature')
            elif 'Fax' in contact_type:
                p.add_contact('fax', value, 'legislature')
            elif 'Email' in contact_type:
                p.add_contact('email', value)
        yield p
def scrape(self):
    """Scrape municipal councils grouped by organization type.

    Yields each council Organization followed by its members; the first
    listed member of each council is the mayor.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    # First four links are the municipality types; their index is used to look
    # up a suffix in the module-level org_types sequence — assumes the link
    # order matches org_types (TODO confirm against module definition).
    types = page.xpath('//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href')[:4]
    for org_type, link in enumerate(types):
        page = self.lxmlize(link)
        district_urls = page.xpath('//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href')
        for district_url in district_urls:
            page = self.lxmlize(district_url)
            district = page.xpath('//div[@class="pageHeader"]/h1/text()')[0].split(' - ')[1].strip()
            org = Organization(name=district + org_types[org_type], classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(district_url)
            yield org
            address = ', '.join(page.xpath('//div[@class="left_contents"]/p[1]/text()'))
            contacts = page.xpath('//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()')
            # "Phone: ..." / "Fax: ..." lines; keep the value after the colon.
            phone = contacts[0].split(':')[1].strip().replace(' ', '-')
            fax = contacts[1].split(':')[1].strip().replace(' ', '-')
            email = self.get_email(page, '//div[@class="left_contents"]')
            site = page.xpath('//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]')
            if site:
                site = site[0].text_content()
            councillors = page.xpath('//div[@class="right_contents"]//p/text()')
            for i, councillor in enumerate(councillors):
                if 'Vacant' in councillor:
                    continue
                p = Person(primary_org='legislature', name=councillor, district=district)
                p.add_source(COUNCIL_PAGE)
                p.add_source(link)
                p.add_source(district_url)
                # First listed member is the mayor.
                if i == 0:
                    membership = p.add_membership(org, role='Mayor')
                else:
                    membership = p.add_membership(org, role='Councillor')
                membership.post_id = district
                membership.add_contact_detail('address', address, 'legislature')
                if phone:
                    membership.add_contact_detail('voice', phone, 'legislature')
                if fax:
                    membership.add_contact_detail('fax', fax, 'legislature')
                if email:
                    membership.add_contact_detail('email', email)
                if site:
                    p.add_link(site)
                yield p
def scrape(self):
    """Scrape hamlet councils and their members.

    Yields an Organization per municipal council, then a Person (with a
    membership carrying contact details) per councillor.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@class="entry-content"]//p/strong')
    for councillor in councillors:
        # FIX: `'–'.decode('utf-8')` is Python-2-only — str has no .decode in
        # Python 3; the unicode literals can be used directly.
        district = councillor.xpath(
            './ancestor::p/preceding-sibling::h2')[-1].text_content().split('–')[0]
        # Last two whitespace-separated tokens are the person's name.
        name = ' '.join(councillor.text_content().split()[-2:]).replace('-Â', '')
        role = councillor.text_content().replace(name, '').split('-')[0]
        if 'SAO' in role or not role:  # skip administrative officers / blanks
            continue
        org = Organization(
            name=district + ' Municipal Council',
            classification='legislature',
            jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(COUNCIL_PAGE)
        yield org
        p = Person(primary_org='legislature', name=name, district=district)
        p.add_source(COUNCIL_PAGE)
        membership = p.add_membership(org, role=role, district=district)
        info = councillor.xpath('./ancestor::p/text()')
        for contact in info:
            # 'NT' marks the postal-address line (Northwest Territories).
            if 'NT' in contact:
                membership.add_contact_detail('address', contact.strip(),
                                              'legislature')
            if 'Tel' in contact:
                contact = contact.replace('Tel. ', '').replace('(', '').replace(
                    ') ', '-').strip()
                membership.add_contact_detail('voice', contact, 'legislature')
            if 'Fax' in contact:
                contact = contact.replace('Fax ', '').replace('(', '').replace(
                    ') ', '-').strip()
                membership.add_contact_detail('fax', contact, 'legislature')
        email = self.get_email(councillor, './parent::p')
        membership.add_contact_detail('email', email)
        if 'Website' in councillor.xpath('./parent::p')[0].text_content():
            p.add_link(
                councillor.xpath('./parent::p//a')[1].attrib['href'])
        yield p
def scrape(self):
    """Yield Markham's mayor and every councillor as Person objects."""
    seat = 1  # next synthetic seat number for at-large regional councillors
    page = self.lxmlize(COUNCIL_PAGE)
    mayor_url = page.xpath('//a[contains(text(), "Office of the Mayor")]/@href')[0]
    yield self.scrape_mayor(mayor_url)
    cells = page.xpath('//div[@class="interiorContentWrapper"]//td[./a]')
    assert len(cells), 'No councillors found'
    for cell in cells:
        label = ' '.join(cell.xpath('.//strong/text()'))
        # Strip a leading title ("Mayor" before "Councillor") if one is present.
        for title in ('Mayor', 'Councillor'):
            if title in label:
                name = label.split(title)[1]
                break
        else:
            name = label
        district = cell.xpath('.//a//text()[normalize-space()]')[0]
        if 'Ward' in district:
            role = 'Councillor'
            district = district.replace('Councillor', '')
        elif 'Regional' in district:
            role = 'Regional Councillor'
            district = 'Markham (seat {})'.format(seat)
            seat += 1
        else:
            # e.g. "Deputy Mayor": the label is the role; the seat is city-wide.
            role, district = district, 'Markham'
        image = cell.xpath('.//img/@src')[0]
        url = cell.xpath('.//a/@href')[0]
        address, phone, email, links = self.get_contact(url)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = image
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email)
        for link in links:
            p.add_link(link)
        yield p
def scrape(self):
    """Scrape Pickering's mayor and councillors from the council table."""
    page = self.lxmlize(COUNCIL_PAGE)
    # Shared contact blocks: column 1 is the mayor's, column 2 the councillors'.
    mayor_contacts = page.xpath('//table[1]//tr/td[1]/text()')
    council_contacts = page.xpath('//table[1]//tr/td[2]/text()')
    councillors = page.xpath(
        '//table[@id="Table3table"]//img/ancestor::td')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        name = councillor.xpath('.//strong//text()')[0]
        if 'Councillor' in name:
            name = name.replace('Councillor', '').strip()
            role_ward = councillor.xpath('./text()')[0]
            if not role_ward.strip():
                role_ward = councillor.xpath('.//p/text()')[0]
            role_ward = role_ward.split(' ')
            # FIX: raw string — '\A' in a plain literal is an invalid escape
            # sequence (DeprecationWarning; SyntaxError in future Pythons).
            role = re.sub(r'\ACity ', '', ' '.join(role_ward[:2]))
            ward = ' '.join(role_ward[2:])
        else:
            name = councillor.xpath('.//strong/text()')[1]
            role = 'Mayor'
            ward = 'Pickering'
        email = self.get_email(councillor)
        p = Person(primary_org='legislature', name=name, district=ward, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('email', email)
        p.image = councillor.xpath('.//img/@src')[0]
        links = councillor.xpath('.//a')
        for link in links:
            if '@' in link.text_content():
                continue  # mailto link already captured via get_email
            if 'Profile' in link.text_content():
                p.add_source(link.attrib['href'])
            else:
                p.add_link(link.attrib['href'])
        # Attach the shared office contact block matching this member's role.
        if role == 'Mayor':
            add_contacts(p, mayor_contacts)
        else:
            add_contacts(p, council_contacts)
        yield p
def scrape(self):
    """Scrape each district council and its elected officials.

    Yields the council Organization for each district, then a Person per
    elected official with the district office's shared contact details.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    districts = page.xpath('//div[@id="left-content" or @id="right-content"]//a')
    for district in districts:
        url = district.attrib['href']
        page = self.lxmlize(url)
        org = Organization(name=district.text_content() + ' Council', classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(url)
        yield org
        # FIX: reset per district. Previously these names were never
        # initialized: a missing field raised NameError on the first district
        # and silently reused the previous district's value on later ones.
        phone = fax = address = email = site = None
        info = page.xpath('//div[@style="WIDTH:750"]/dl')
        for contact in info:
            contact_type = contact.xpath('./dt')[0].text_content()
            contact = contact.xpath('./dd')[0].text_content().replace('(', '').replace(') ', '-')
            if 'Officials' in contact_type:
                break  # contact section ends where the member list begins
            if 'Tel' in contact_type:
                phone = contact
            if 'Fac' in contact_type:  # "Facsimile"
                fax = contact
            if 'Address' in contact_type:
                address = contact
            if 'Email' in contact_type:
                email = contact
            if 'Website' in contact_type:
                site = contact
        councillors = page.xpath('//div[@style="WIDTH:750"]/dl/dt[contains(text(), "Elected Officials")]/parent::dl/dd/pre/text()')[0].splitlines(True)
        for councillor in councillors:
            # Lines look like "Name (Mayor)"; bare names are councillors.
            name = councillor.replace('(Mayor)', '').replace('(Deputy Mayor)', '').replace('(Chairperson)', '').strip()
            role = re.sub(r'\(|\)', '', councillor.replace(name, '').strip())
            if not role:
                role = 'Councillor'
            p = Person(primary_org='legislature', name=name, district=district.text_content())
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            membership = p.add_membership(org, role=role, district=district.text_content())
            # FIX: only attach details that were actually present on the page.
            if phone:
                membership.add_contact_detail('voice', self.clean_telephone_number(phone), 'legislature')
            if fax:
                membership.add_contact_detail('fax', self.clean_telephone_number(fax), 'legislature')
            if address:
                membership.add_contact_detail('address', self.clean_address(address), 'legislature')
            if email:
                membership.add_contact_detail('email', email)
            if site:
                p.add_link(site)
            yield p
def scrape(self):
    """Scrape the mayor and ward councillors; a ward may hold several seats."""
    seat_numbers = defaultdict(int)
    page = self.lxmlize(COUNCIL_PAGE)
    yield self.scrape_mayor()
    councillors = page.xpath('//div[@id="centre_content"]//tr')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        if 'Position' in councillor.text_content():  # skip the header row
            continue
        ward = councillor.xpath('./td')[0].text_content().replace(
            'Councillor', '')
        # Number seats within each ward in page order.
        seat_numbers[ward] += 1
        district = '{} (seat {})'.format(ward, seat_numbers[ward])
        name = councillor.xpath('./td')[1].text_content()
        url = councillor.xpath('./td/a')[0].attrib['href']
        p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        page = self.lxmlize(url)
        content = page.xpath('//div[@id="centre_content"]')[0]
        email = self.get_email(content)
        p.add_contact('email', email)
        p.add_contact('voice',
                      self.get_phone(content, area_codes=[226, 519]),
                      'legislature')
        p.image = page.xpath(
            'string(//div[@id="centre_content"]//img/@src)'
        )  # can be empty
        if len(page.xpath('//div[@id="centre_content"]//a')) > 2:
            p.add_link(
                page.xpath('//div[@id="centre_content"]//a')
                [-1].attrib['href'])
        yield p
def scrape(self):
    """Scrape ward councillors; the mayor is handled by scrape_mayor()."""
    yield self.scrape_mayor()
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//th[contains(text(), "Ward")]')
    assert len(councillors), 'No councillors found'
    for cell in councillors:
        district = cell.text
        name = cell[1].text
        if name != 'Vacant':
            page_url = cell[1].attrib['href']
            page = self.lxmlize(page_url)
            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(page_url)
            image = page.xpath('//div[@id="contentArea"]//img/@src')
            if image:
                p.image = image[0]
            address = page.xpath('//address//p')
            if address:
                address = address[0].text_content()
                p.add_contact('address', address, 'legislature')
            contacts = page.xpath('//table[@class="contactListing"]//tr')
            for contact in contacts:
                contact_type = contact.xpath('./th/text()')[0]
                value = contact.xpath('./td//text()')[0]
                if 'Title' in contact_type:
                    continue
                elif 'Website' in contact_type or 'Facebook' in contact_type or 'Twitter' in contact_type:
                    # Social/website rows become links rather than contact details.
                    value = contact.xpath('./td/a/text()')[0]
                    p.add_link(value)
                elif 'Telephone' in contact_type:
                    p.add_contact('voice', value, 'legislature')
                elif 'Fax' in contact_type:
                    p.add_contact('fax', value, 'legislature')
                elif 'Email' in contact_type:
                    p.add_contact('email', value)
            yield p
def scrape(self):
    """Yield Richmond Hill's mayor, regional councillors and ward councillors."""
    seat = 1  # synthetic seat counter for at-large regional councillors
    index = self.lxmlize(COUNCIL_PAGE)
    for link in index.xpath('//center/center//a'):
        name = link.text_content().strip()
        profile_url = link.attrib['href']
        profile = self.lxmlize(profile_url)
        heading = profile.xpath('//div[@class="sectionheading"]')[0].text_content()
        if heading == 'Mayor of Richmond Hill':
            role, district = 'Mayor', 'Richmond Hill'
        else:
            role = 'Regional Councillor' if 'Regional' in heading else 'Councillor'
            # Ward name sits between a comma and a dash in the heading.
            ward = re.findall(r',(.*)-', heading)
            if ward:
                district = ward[0].strip()
            else:
                district = 'Richmond Hill (seat {})'.format(seat)
                seat += 1
        cells = profile.xpath('//table[@cellpadding>0]/tbody/tr/td[last()]|//table[not(@cellpadding)]/tbody/tr/td[last()]')
        text = cells[0].text_content().replace(' - office:', ':')
        address = re.findall(r'(?<=Town of Richmond Hill)(.*(?=Telephone:)|(?=Telephone))', text)[0]
        address = re.sub(r'([a-z])([A-Z])', r'\1 \2', address)  # restore lost spaces
        # I expected to be able to do '(.*)(?=\sTelephone|Telephone|Fax)', but nope.
        phone = re.findall(r'(?<=Telephone:) ((.*) (?=Telephone)|(.*)(?=Telephone)|(.*)(?=Fax))', text)[0][0]
        phone = phone.replace('(', '').replace(') ', '-').replace(', ext. ', ' x')
        fax = re.findall(r'(?<=Fax:) (.*)(?=E-mail)', text)[0]
        fax = fax.replace(' ', '').replace('(', '').replace(')', '-')
        email = self.get_email(profile)
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(profile_url)
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email)
        p.image = profile.xpath('//img[contains(@alt, "{}")]/@src'.format(name))[0]
        if 'Website' in text:
            p.add_link(re.findall(r'www\..*\.[a-z]+', text)[0])
        yield p
def scrape(self):
    """Scrape Pickering's mayor and councillors from the council table."""
    page = self.lxmlize(COUNCIL_PAGE)
    # Shared contact blocks: column 1 is the mayor's, column 2 the councillors'.
    mayor_contacts = page.xpath('//table[1]//tr/td[1]/text()')
    council_contacts = page.xpath('//table[1]//tr/td[2]/text()')
    councillors = page.xpath('//table[@id="Table3table"]//img/ancestor::td')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        name = councillor.xpath('.//strong//text()')[0]
        if 'Councillor' in name:
            name = name.replace('Councillor', '').strip()
            role_ward = councillor.xpath('./text()')[0]
            if not role_ward.strip():
                role_ward = councillor.xpath('.//p/text()')[0]
            # First two tokens are the role (minus a leading "City"); the rest
            # form the ward name.
            role_ward = role_ward.split(' ')
            role = re.sub(r'\ACity ', '', ' '.join(role_ward[:2]))
            ward = ' '.join(role_ward[2:])
        else:
            name = councillor.xpath('.//strong/text()')[1]
            role = 'Mayor'
            ward = 'Pickering'
        email = self.get_email(councillor)
        p = Person(primary_org='legislature', name=name, district=ward, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('email', email)
        p.image = councillor.xpath('.//img/@src')[0]
        links = councillor.xpath('.//a')
        for link in links:
            if '@' in link.text_content():
                continue  # mailto link already captured via get_email
            if 'Profile' in link.text_content():
                p.add_source(link.attrib['href'])
            else:
                p.add_link(link.attrib['href'])
        # Attach the shared office contact block matching this member's role.
        if role == 'Mayor':
            add_contacts(p, mayor_contacts)
        else:
            add_contacts(p, council_contacts)
        yield p
def scrape(self):
    """Scrape Ajax's mayor and councillors from the site navigation menu."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath(
        '//ul[@class="subNav top"]/li/ul//li/a[contains(text(), "Councillor")] | //ul[@class="subNav top"]/li/ul//li/a[contains(text(), "Mayor")]'
    )
    # FIX: fail loudly when the navigation layout changes, matching the
    # empty-result asserts used by this file's other scrapers.
    assert len(councillors), "No councillors found"
    for councillor in councillors:
        name = councillor.text_content()
        url = councillor.attrib["href"]
        page = self.lxmlize(url)
        # The first nav entry is the mayor; the rest are (regional) councillors.
        if councillor == councillors[0]:
            district = "Ajax"
            role = "Mayor"
        else:
            district = re.findall(r"Ward.*", page.xpath('//div[@id="printAreaContent"]//h1')[0].text_content())[
                0
            ].strip()
            role = page.xpath('//div[@id="printAreaContent"]//h1')[0].text_content()
            role = re.search("((?:Regional )?Councillor)", role).group(1)
            # The nav label includes the role; remove it to leave the name.
            name = name.replace(role, "").strip()
        p = Person(primary_org="legislature", name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = page.xpath('//div[@class="intQuicklinksPhoto"]//img/@src')[0]
        contact_info = page.xpath('//table[@class="datatable"][1]//tr')[1:]
        for line in contact_info:
            contact_type = line.xpath("./td")[0].text_content().strip()
            if re.match(r"(Home)|(Cell)|(Phone)|(Fax)|(Email)", contact_type):
                contact = line.xpath("./td")[1].text_content().strip()
                # Map the site's label to an OCD contact-detail type.
                contact_type = CONTACT_DETAIL_TYPE_MAP[contact_type]
                p.add_contact(contact_type, contact, "" if contact_type == "email" else "legislature")
            elif contact_type == "Address":
                contact = "".join(line.xpath("./td[2]//text()")).strip()
                p.add_contact(contact_type, contact, "legislature")
            else:
                # Remaining rows (e.g. Website) are links, not contact details.
                contact = line.xpath("./td[2]/a/@href")[0]
                p.add_link(contact)
        yield p
def scrape(self):
    """Scrape ward councillors; the mayor is handled by scrape_mayor()."""
    yield self.scrape_mayor()
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[contains(@class, "documentexcerpt-module__item")]')
    assert len(councillors), 'No councillors found'
    for cell in councillors:
        name = cell[1].text
        if name != 'Vacant':
            page_url = cell[0].attrib['href']
            page = self.lxmlize(page_url)
            # Profile page title reads "Ward N - Name".
            district_name = page.xpath('//h1[contains(@class, "page-title")]')[0].text_content()
            district, name = district_name.split(' - ', 1)
            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(page_url)
            image = page.xpath('//div[contains(@class, "content")]//img/@src')
            if image:
                p.image = image[0]
            address = page.xpath('//address//p')
            if address:
                address = address[0].text_content()
                p.add_contact('address', address, 'legislature')
            contacts = page.xpath('//table[@summary="Contact information"]//tr')
            for contact in contacts:
                contact_type = contact.xpath('./th/text()')[0]
                value = contact.xpath('./td//text()')[0]
                if 'Title' in contact_type:
                    continue
                elif 'Website' in contact_type or 'Facebook' in contact_type or 'Twitter' in contact_type:
                    # Social/website rows become links rather than contact details.
                    value = contact.xpath('./td/a/text()')[0]
                    p.add_link(value)
                elif 'Telephone' in contact_type:
                    p.add_contact('voice', value, 'legislature')
                elif 'Fax' in contact_type:
                    p.add_contact('fax', value, 'legislature')
                elif 'Email' in contact_type:
                    p.add_contact('email', value)
            yield p
def scrape(self):
    """Scrape ward councillors; the mayor is handled by scrape_mayor()."""
    yield self.scrape_mayor()
    page = self.lxmlize(COUNCIL_PAGE)
    councillor_cells = page.xpath('//th[contains(text(), "Ward")]')
    # FIX: fail loudly if the table layout changed and nothing matched,
    # matching the empty-result asserts used by this file's other scrapers.
    assert len(councillor_cells), 'No councillors found'
    for cell in councillor_cells:
        district = cell.text
        name = cell[1].text
        if name != 'Vacant':
            page_url = cell[1].attrib['href']
            page = self.lxmlize(page_url)
            p = Person(primary_org='legislature', name=name, district=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)
            p.add_source(page_url)
            image = page.xpath('//div[@id="contentArea"]//img/@src')
            if image:
                p.image = image[0]
            address = page.xpath('//address//p')
            if address:
                address = address[0].text_content()
                p.add_contact('address', address, 'legislature')
            contacts = page.xpath('//table[@class="contactListing"]//tr')
            for contact in contacts:
                contact_type = contact.xpath('./th/text()')[0]
                value = contact.xpath('./td//text()')[0]
                if 'Title' in contact_type:
                    continue
                elif 'Website' in contact_type or 'Facebook' in contact_type or 'Twitter' in contact_type:
                    # Social/website rows become links rather than contact details.
                    value = contact.xpath('./td/a/text()')[0]
                    p.add_link(value)
                elif 'Telephone' in contact_type:
                    p.add_contact('voice', value, 'legislature')
                elif 'Fax' in contact_type:
                    p.add_contact('fax', value, 'legislature')
                elif 'Email' in contact_type:
                    p.add_contact('email', value)
            yield p
def scrape(self):
    """Scrape Ajax's mayor and councillors from the site navigation menu."""
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//ul[@class="subNav top"]/li/ul//li/a[contains(text(), "Councillor")] | //ul[@class="subNav top"]/li/ul//li/a[contains(text(), "Mayor")]')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        name = councillor.text_content()
        url = councillor.attrib['href']
        page = self.lxmlize(url)
        # The first nav entry is the mayor; the rest are (regional) councillors.
        if councillor == councillors[0]:
            district = 'Ajax'
            role = 'Mayor'
        else:
            district = re.findall(r'Ward.*', page.xpath('//div[@id="printAreaContent"]//h1')[0].text_content())[0].strip()
            role = page.xpath('//div[@id="printAreaContent"]//h1')[0].text_content()
            role = re.search('((?:Regional )?Councillor)', role).group(1)
            # The nav label includes the role; remove it to leave the name.
            name = name.replace(role, '').strip()
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = page.xpath('//div[@class="intQuicklinksPhoto"]//img/@src')[0]
        contact_info = page.xpath('//table[@class="datatable"][1]//tr')[1:]
        for line in contact_info:
            contact_type = line.xpath('./td')[0].text_content().strip()
            if re.match(r'(Home)|(Cell)|(Phone)|(Fax)|(Email)', contact_type):
                contact = line.xpath('./td')[1].text_content().strip()
                # Map the site's label to an OCD contact-detail type.
                contact_type = CONTACT_DETAIL_TYPE_MAP[contact_type]
                p.add_contact(contact_type, contact, '' if contact_type == 'email' else 'legislature')
            elif contact_type == 'Address':
                contact = ''.join(line.xpath('./td[2]//text()')).strip()
                p.add_contact(contact_type, contact, 'legislature')
            else:
                # Remaining rows (e.g. Website) are links, not contact details.
                contact = line.xpath('./td[2]/a/@href')[0]
                p.add_link(contact)
        yield p
def scrape(self):
    """Yield Vaughan's mayor, local councillors and regional councillors."""
    # Regional councillors are at-large, so they share one division and are
    # distinguished by a running seat number.
    regional_councillor_seat_number = 1
    page = self.lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@id="WebPartWPQ3"]//ul[@class="dfwp-list"][1]/li/div/div/a')
    assert len(councillors), 'No councillors found'
    for councillor in councillors:
        url = councillor.attrib['href']
        page = self.lxmlize(url)
        title = page.xpath('//div[@class="PL_Title"]')[0].text_content()
        if "Councillor" in title:
            # Title looks like "<district> Councillor <name>".
            district, name = re.split(r'Councillor', title)
            role = 'Councillor'
            if "Regional" in district:
                role = 'Regional Councillor'
                district = "Vaughan (seat {})".format(regional_councillor_seat_number)
                regional_councillor_seat_number += 1
        else:
            # The mayor's page has no "Councillor" title; pull the name from
            # the page's keyword metadata instead.
            name = re.search(r'Mayor ([^,]+)', page.xpath('//meta[@name="keywords"]/@content')[0]).group(1)
            district = 'Vaughan'
            role = 'Mayor'
        name = name.strip()
        if role == 'Mayor':
            # Mayor's contact details live on a separate "Contact the Mayor" page.
            detail = self.lxmlize(page.xpath('//a[contains(@href,"/Contact-the-Mayor")]/@href')[0])
            contact_info = detail.xpath('//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]')[0]
        else:
            # Prefer the web part that actually contains a phone number;
            # fall back to the next web part otherwise.
            contact_node = page.xpath('//div[@id="WebPartWPQ2"][contains(., "Phone")]')
            if contact_node:
                contact_info = contact_node[0]
            else:
                contact_info = page.xpath('//div[@id="WebPartWPQ3"]')[0]
        # First match with an extension is the voice number; the second plain
        # number is assumed to be the fax line — TODO confirm against the site.
        phone = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4} ext\. [0-9]{4}', contact_info.text_content())[0].replace('ext. ', 'x')
        fax = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4}', contact_info.text_content())[1]
        email = self.get_email(contact_info)
        p = Person(primary_org='legislature', name=name, district=district.strip(), role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email)
        image = page.xpath('//img[contains(@alt, "Councillor")]/@src')
        if image:
            p.image = image[0]
        if page.xpath('.//a[contains(@href,"facebook")]'):
            p.add_link(page.xpath('.//a[contains(@href,"facebook")]')[0].attrib['href'])
        if page.xpath('.//a[contains(@href,"twitter")]'):
            p.add_link(page.xpath('.//a[contains(@href,"twitter")]')[0].attrib['href'])
        if page.xpath('.//a[contains(@href,"youtube")]'):
            p.add_link(page.xpath('.//a[contains(@href, "youtube")]')[0].attrib['href'])
        yield p
def scrape(self):
    """Yield the mayor and councillors from the team-member cards.

    The final card on the page is not a council member and is skipped.
    """
    page = self.lxmlize(COUNCIL_PAGE, user_agent=CUSTOM_USER_AGENT)
    cards = page.xpath('//section[contains(@class,"avia-team-member")]')[:-1]
    assert len(cards), 'No councillors found'
    for card in cards:
        name = card.xpath('.//h3/text()')[0]
        if card.xpath('.//div[contains(@class,"team-member-job-title")][contains(.,"Maire")]/text()'):
            role = 'Maire'
            district = 'Côte-Saint-Luc'
        else:
            # Job title looks like "<role>, <district>".
            title = card.xpath('.//div[contains(@class,"team-member-job-title")]/text()')[0]
            role, district = title.split(',', 1)
            if role == 'Conseillère':
                role = 'Conseiller'
        p = Person(primary_org='legislature', name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('email', self.get_email(card))
        p.add_contact('voice', self.get_phone(card, area_codes=[514]), 'legislature')
        p.image = card.xpath('.//img/@src')[0]
        # Optional social/web links, in a fixed order.
        for candidates in (card.xpath('.//p[contains(.,"Twitter")]/a/text()'),
                           card.xpath('.//p[contains(.,"Web")]/a/@href'),
                           card.xpath('.//p[contains(.,"Blog")]/a/@href')):
            if candidates:
                p.add_link(candidates[0])
        yield p
def scrape_people(self, rows, gender):
    """Yield MPs from a list of member tiles on ourcommons.ca.

    ``rows`` are the tile elements for one gender grouping; ``gender`` is
    recorded on each Person as-is.
    """
    assert len(rows), 'No members found'
    for row in rows:
        name = row.xpath('.//div[@class="ce-mip-mp-name"][1]')[0].text_content()
        constituency = row.xpath('.//div[@class="ce-mip-mp-constituency"][1]')[0].text_content()
        constituency = constituency.replace('–', '—')  # n-dash, m-dash
        if constituency == 'Mont-Royal':
            constituency = 'Mount Royal'
        province = row.xpath('.//div[@class="ce-mip-mp-province"][1]')[0].text_content()
        party = row.xpath('.//div[@class="ce-mip-mp-party"][1]')[0].text_content()
        url = row.xpath('.//a[@class="ce-mip-mp-tile"]/@href')[0]
        if province == 'Québec':
            # Use the French profile for Québec members.
            url = url.replace('/en/', '/fr/')
        mp_page = self.lxmlize(url)
        email = self.get_email(mp_page, '//*[@id="contact"]/div/p/a', error=False)
        photo = mp_page.xpath('.//div[@class="ce-mip-mp-profile-container"]//img/@src')[0]
        m = Person(primary_org='lower', name=name, district=constituency, role='MP', party=party)
        m.add_source(COUNCIL_PAGE)
        m.add_source(url)
        m.gender = gender
        # @see https://www.ourcommons.ca/Members/en/ziad-aboultaif(89156)
        if email:
            m.add_contact('email', email)
        if photo:
            # Determine whether the photo is actually a generic silhouette.
            photo_response = self.get(photo)
            if (photo_response.status_code == 200 and
                    hashlib.sha1(photo_response.content).hexdigest() not in IMAGE_PLACEHOLDER_SHA1):
                m.image = photo
        # I don't think the new parliament website has personal websites anymore.
        personal_url = mp_page.xpath('.//a[contains(@title, "Personal Web Site")]/@href')
        if personal_url:
            m.add_link(personal_url[0])
        preferred_languages = mp_page.xpath('.//dt[contains(., "Preferred Language")]/following-sibling::dd/text()')
        if preferred_languages:
            m.extras['preferred_languages'] = [language.replace('/', '').strip() for language in preferred_languages]
        if province == 'Québec':
            m.add_contact('address', 'Chambre des communes\nOttawa ON K1A 0A6', 'legislature')
        else:
            m.add_contact('address', 'House of Commons\nOttawa ON K1A 0A6', 'legislature')
        # Hill Office contacts.
        # Phone and fax share one element:
        # <p>
        #   Telephone: xxx-xxx-xxxx<br/>
        #   Fax: xxx-xxx-xxx
        # </p>
        phone_and_fax_el = mp_page.xpath('.//h4[contains(., "Hill Office")]/../p[contains(., "Telephone")]|.//h4[contains(., "Hill Office")]/../p[contains(., "Téléphone :")]')
        if len(phone_and_fax_el):
            phone_and_fax = phone_and_fax_el[0].text_content().strip().splitlines()
            voice = phone_and_fax[0].replace('Telephone:', '').replace('Téléphone :', '').strip()
            fax = phone_and_fax[1].replace('Fax:', '').replace('Télécopieur :', '').strip()
            if voice:
                m.add_contact('voice', voice, 'legislature')
            if fax:
                m.add_contact('fax', fax, 'legislature')
        # Constituency Office contacts.
        # Some people have more than one, e.g.
        # https://www.ourcommons.ca/Members/en/ben-lobb(35600)#contact
        for i, constituency_office_el in enumerate(
                mp_page.xpath('.//div[@class="ce-mip-contact-constituency-office-container"]/div')):
            note = 'constituency'
            if i:
                note += ' ({})'.format(i + 1)
            address = constituency_office_el.xpath('./p[1]')[0]
            address = address.text_content().strip().splitlines()
            address = list(map(str.strip, address))
            m.add_contact('address', '\n'.join(address), note)
            phone_and_fax_el = constituency_office_el.xpath('./p[contains(., "Telephone")]|./p[contains(., "Téléphone")]')
            if len(phone_and_fax_el):
                phone_and_fax = phone_and_fax_el[0].text_content().strip().splitlines()
                # Note that https://www.ourcommons.ca/Members/en/michael-barrett(102275)#contact
                # has an empty value - "Telephone:". So the search / replace cannot include space.
                voice = phone_and_fax[0].replace('Telephone:', '').replace('Téléphone :', '').strip()
                # BUG FIX: reset fax for each office. Previously, when an office
                # had no fax line, the stale fax from the Hill Office (or a
                # previous constituency office) was re-added under this note.
                fax = None
                if len(phone_and_fax) > 1:
                    fax = phone_and_fax[1].replace('Fax:', '').replace('Télécopieur :', '').strip()
                if voice:
                    m.add_contact('voice', voice, note)
                if fax:
                    m.add_contact('fax', fax, note)
        yield m
def scrape(self):
    """Yield municipal organizations and their members, by municipality type.

    Walks up to four type listings (presumably cities, towns, etc. — the
    ``org_types`` module-level lookup supplies the display suffix; verify
    against the module constants), then each municipality's detail page.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    types = page.xpath(
        '//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href'
    )[:4]
    for org_type, link in enumerate(types):
        page = self.lxmlize(link)
        district_urls = page.xpath(
            '//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href'
        )
        for district_url in district_urls:
            page = self.lxmlize(district_url)
            # Header looks like "<prefix> - <district name>".
            district = page.xpath('//div[@class="pageHeader"]/h1/text()')[0].split(' - ')[1].strip()
            org = Organization(
                name=district + org_types[org_type],
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(district_url)
            yield org
            address = ', '.join(
                page.xpath('//div[@class="left_contents"]/p[1]/text()'))
            contacts = page.xpath(
                '//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()'
            )
            # NOTE(review): assumes the contact block always has a phone line
            # followed by a fax line; contacts[1] would raise IndexError
            # otherwise — confirm against the site.
            phone = contacts[0].split(':')[1].strip().replace(' ', '-')
            fax = contacts[1].split(':')[1].strip().replace(' ', '-')
            email = self.get_email(page, '//div[@class="left_contents"]')
            site = page.xpath(
                '//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]'
            )
            if site:
                site = site[0].text_content()
            councillors = page.xpath(
                '//div[@class="right_contents"]//p/text()')
            for i, councillor in enumerate(councillors):
                if 'Vacant' in councillor:
                    continue
                p = Person(primary_org='legislature', name=councillor, district=district)
                p.add_source(COUNCIL_PAGE)
                p.add_source(link)
                p.add_source(district_url)
                # The first listed name is the mayor; the rest are councillors.
                if i == 0:
                    membership = p.add_membership(org, role='Mayor')
                else:
                    membership = p.add_membership(org, role='Councillor')
                membership.post_id = district
                membership.add_contact_detail('address', address, 'legislature')
                if phone:
                    membership.add_contact_detail('voice', phone, 'legislature')
                if fax:
                    membership.add_contact_detail('fax', fax, 'legislature')
                if email:
                    membership.add_contact_detail('email', email)
                if site:
                    p.add_link(site)
                yield p
def scrape(self):
    """Yield Saskatchewan MLAs from the member table, skipping vacant seats."""
    page = self.lxmlize(COUNCIL_PAGE)
    members = page.xpath('//table[@id="MLAs"]//tr')[1:]
    assert len(members), 'No members found'
    for member in members:
        if 'Vacant' in member.xpath('./td')[0].text_content():
            continue
        # First cell is "<honorific>. <name>".
        name = member.xpath('./td')[0].text_content().split('. ', 1)[1]
        party = member.xpath('./td')[1].text
        district = member.xpath('./td')[2].text_content()
        url = member.xpath('./td[1]/a/@href')[0]
        page = self.lxmlize(url)
        p = Person(primary_org='legislature', name=name, district=district, role='MLA', party=party)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = page.xpath('//div[contains(@class, "mla-image-cell")]/img/@src')[0]
        contact = page.xpath('//div[@id="mla-contact"]/div[2]')[0]
        website = contact.xpath('./div[3]/div[3]/div[2]/a')
        if website:
            p.add_link(website[0].text_content())

        def handle_address(lines, address_type):
            # Address lines run until the first "<label>:" marker
            # (Room:, Phone:, Fax:).
            address_lines = []
            for line in lines:
                if line.endswith(':'):
                    break
                address_lines.append(line)
            if address_lines:
                p.add_contact(
                    'address',
                    ' '.join(address_lines),
                    address_type,
                )

        def handle_phone(lines, phone_type):
            # The number follows a literal "Phone:" line, unless the next
            # line is itself another label (meaning no number is listed).
            if 'Phone:' not in lines:
                return
            next_line = lines[lines.index('Phone:') + 1]
            if next_line.endswith(':'):
                return
            number = None
            if '/' in next_line:
                # Multiple numbers: keep the one in the 306 area code.
                for fragment in next_line.split('/'):
                    if fragment.strip().startswith('306-'):
                        number = fragment.strip()
                        break
            else:
                number = next_line
            # BUG FIX: only record a number when one was found; previously a
            # '/'-separated line with no 306 fragment passed None through.
            if number:
                p.add_contact('voice', number, phone_type, area_code=306)

        legislature_lines = contact.xpath('.//div[@class="col-md-4"][1]/div//text()')
        assert legislature_lines[0] == 'Legislative Building Address'
        handle_address(legislature_lines[1:], 'legislature')
        handle_phone(legislature_lines[1:], 'legislature')
        constituency_lines = contact.xpath('.//div[@class="col-md-4"][2]/div//text()')
        assert constituency_lines[0] == 'Constituency Address'
        handle_address(constituency_lines[1:], 'constituency')
        handle_phone(constituency_lines[1:], 'constituency')
        email = self.get_email(contact, error=False)
        if email:
            p.add_contact('email', email)
        yield p
def scrape(self):
    """Yield BC candidates (from a CSV) and their council organizations.

    District names are mapped to OCD census-subdivision IDs; ambiguous
    names map to None and must carry an explicit district id in the CSV.
    """
    exclude_divisions = {}
    # Electoral areas and other districts that are not municipal councils.
    exclude_districts = {
        'Capital',
        'Capital F',
        'Capital G',
        'Capital H',
        'Central Coast B',
        'Central Okanagan East',
        'Central Okanagan West',
        'Comox Valley B',
        'Comox Valley C',
        'Islands Trust',
        'Kitimat-Stikine C',
        'Kootenay Boundary B',
        'Kootenay Boundary C',
        'Kootenay Boundary D',
        'Kootenay Boundary E',
        'Metro Vancouver A',
        'North Coast A',
        'North Coast C',
        'North Coast D',
        'North Coast E',
        'Okanagan-Similkameen I',
        'Okanagan-Similkameen Olalla Local Community Commission',
        'Qathet A',
        'Qathet B',
        'Qathet C',
        'Qathet D',
        'Qathet E',
    }
    expected_roles = {
        'candidate',
    }
    # Census-subdivision classification -> council-name infix.
    infixes = {
        'CY': 'City',
        'DM': 'District',
        'IGD': 'District',
        'IM': 'Municipal',
        'RGM': 'Regional',
        'T': 'Town',
        'VL': 'Village',
        'RDA': 'District',
    }
    # Distinct people sharing a name get synthetic birth dates to stay unique.
    duplicate_names = {
        'Rick Smith',
        'Sung Y Wong',
        'Elizabeth Taylor',
    }
    names_to_ids = {}
    for division in Division.get('ocd-division/country:ca').children('csd'):
        type_id = division.id.rsplit(':', 1)[1]
        # '59' is the BC province prefix; skip Indian reserves (IRI).
        if type_id.startswith('59'):
            if division.attrs['classification'] == 'IRI':
                continue
            if division.name in names_to_ids:
                # Duplicate name: poison the entry so rows must supply an id.
                names_to_ids[division.name] = None
            else:
                names_to_ids[division.name] = division.id
    reader = self.csv_reader(COUNCIL_PAGE, header=True)
    reader.fieldnames = [field.lower() for field in reader.fieldnames]
    organizations = {}
    birth_date = 1900
    seen = set()
    for row in reader:
        name = row['full name']
        district_name = row['district name']
        if not any(row.values()) or name.lower() in ('', 'vacant') or district_name in exclude_districts:
            continue
        if row['district id']:
            division_id = 'ocd-division/country:ca/csd:{}'.format(row['district id'])
        else:
            division_id = names_to_ids[row['district name']]
        if division_id in exclude_divisions:
            continue
        if not division_id:
            # The name was ambiguous and no explicit district id was given.
            raise Exception('unhandled collision: {}'.format(row['district name']))
        division = Division.get(division_id)
        division_name = division.name
        organization_name = '{} {} Council'.format(division_name, infixes[division.attrs['classification']])
        if division_id not in seen:
            seen.add(division_id)
            organizations[division_id] = Organization(name=organization_name, classification='government')
            organizations[division_id].add_source(COUNCIL_PAGE)
        organization = organizations[division_id]
        role = row['primary role']
        if role not in expected_roles:
            raise Exception('unexpected role: {}'.format(role))
        if row['district id']:
            district = format(division_id)
        else:
            district = division_name
        organization.add_post(role=role, label=district, division_id=division_id)
        p = Person(primary_org='government', primary_org_name=organization_name, name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        if row['source url']:
            p.add_source(row['source url'])
        if name in duplicate_names:
            p.birth_date = str(birth_date)
            birth_date += 1
        if row['email']:
            p.add_contact('email', row['email'])
        if row['phone']:
            p.add_contact('voice', row['phone'], 'legislature')
        if row['twitter']:
            p.add_link(row['twitter'])
        p._related[0].extras['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(division_id.rsplit(':', 1)[1])
        yield p
    # Organizations are yielded last, after all posts have been added.
    for organization in organizations.values():
        yield organization
def scrape(self):
    """Yield Québec MNAs from the member list, with per-office phone numbers."""
    page = self.lxmlize(COUNCIL_PAGE)
    members = page.xpath('//*[@id="ListeDeputes"]/tbody/tr')
    assert len(members), 'No members found'
    for row in members:
        # First two cells: "Last, First" and the electoral division.
        name_comma, division = [cell.text_content() for cell in row[:2]]
        name = ' '.join(reversed(name_comma.strip().split(',')))
        division = division.replace('–', '-')  # n-dash, hyphen
        party = row[2].text_content().strip()
        if party == 'Indépendante':
            party = 'Indépendant'
        email = self.get_email(row[3], error=False)
        detail_url = row[0][0].attrib['href']
        detail_page = self.lxmlize(detail_url)
        # Contact details live on a sibling page of the member's profile.
        contact_url = detail_url.replace('index.html', 'coordonnees.html')
        contact_page = self.lxmlize(contact_url)
        photo_url = detail_page.xpath('//img[@class="photoDepute"]/@src')
        p = Person(primary_org='legislature', name=name, district=division, role='MNA', party=party)
        p.add_source(COUNCIL_PAGE)
        p.add_source(detail_url)
        if photo_url:
            p.image = photo_url[0]
        if email:
            p.add_contact('email', email)
        # The URL slug identifies the member in the static social-media table.
        identifier = re.search(r'/([^/]+)/index.html', detail_url).group(1)
        facebook, twitter = SOCIAL_MEDIA_DATA.get(identifier, ('', ''))
        if facebook:
            p.add_link(facebook)
        if twitter:
            p.add_link(twitter)
        for div in contact_page.xpath('//div[@class="blockAdresseDepute"]'):
            try:
                phone = self.get_phone(div)
                heading = div.find('h3').text
            except Exception:
                pass  # probably just no phone number present
            else:
                try:
                    note = {
                        'Circonscription': 'constituency',
                        'Parlement': 'legislature',
                        'Ministère': 'legislature',
                    }[heading]
                except KeyError:
                    raise  # scraper should be updated to handle new value
                else:
                    p.add_contact('voice', phone, note)
        yield p
def scrape(self):
    """Yield MPs from the legacy parl.gc.ca member table."""
    page = self.lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//div[@class="content-primary"]//tr')[1:]
    assert len(rows), 'No members found'
    for row in rows:
        name_cell = row.xpath('./td[1]')[0]
        last_name = name_cell.xpath('.//span[1]//text()')[0]
        first_name = name_cell.xpath('.//span[2]//text()')[0]
        name = '{} {}'.format(first_name, last_name)
        constituency = row.xpath('./td[2]//text()')[0].replace('–', '—')  # n-dash, m-dash
        if constituency == 'Mont-Royal':
            constituency = 'Mount Royal'
        province = row.xpath('./td[3]//text()')[0]
        party = row.xpath('string(./td[4])')  # allow string()
        url = name_cell.xpath('.//a/@href')[0]
        if province == 'Québec':
            # Use the French profile for Québec members.
            url = url.replace('/en/', '/fr/')
        mp_page = self.lxmlize(url)
        email = self.get_email(mp_page, '//span[@class="caucus"]', error=False)
        photo = mp_page.xpath('//div[@class="profile overview header"]//img/@src')[0]
        m = Person(primary_org='lower', name=name, district=constituency, role='MP', party=party)
        m.add_source(COUNCIL_PAGE)
        m.add_source(url)
        # @see http://www.parl.gc.ca/Parliamentarians/en/members/David-Yurdiga%2886260%29
        if email:
            m.add_contact('email', email)
        elif name == 'Adam Vaughan':
            # Hard-coded fallback (value redacted in source).
            m.add_contact('email', '*****@*****.**')
        if photo:
            # Determine whether the photo is actually a generic silhouette.
            photo_response = self.get(photo)
            if (photo_response.status_code == 200 and hashlib.sha1(photo_response.content).hexdigest() not in IMAGE_PLACEHOLDER_SHA1):
                m.image = photo
        personal_url = mp_page.xpath('//a[contains(@title, "Personal Web Site")]/@href')
        if personal_url:
            m.add_link(personal_url[0])
        preferred_languages = mp_page.xpath('//span[@class="label"][contains(., "Preferred Language")]/following-sibling::span[@class="constituency"]/text()')
        if preferred_languages:
            m.extras['preferred_languages'] = [language.replace('/', '').strip() for language in preferred_languages]
        if province == 'Québec':
            m.add_contact('address', 'Chambre des communes\nOttawa ON K1A 0A6', 'legislature')
        else:
            m.add_contact('address', 'House of Commons\nOttawa ON K1A 0A6', 'legislature')
        # Hill Office phone/fax, in either English or French labels.
        voice = mp_page.xpath('//div[@class="hilloffice"]//span//text()[contains(., "Telephone:")]|//div[@class="hilloffice"]//span//text()[contains(., "Téléphone :")]')[0].replace('Telephone: ', '').replace('Téléphone : ', '')
        if voice:
            m.add_contact('voice', voice, 'legislature')
        fax = mp_page.xpath('//div[@class="hilloffice"]//span//text()[contains(., "Fax:")]|//div[@class="hilloffice"]//span//text()[contains(., "Télécopieur :")]')[0].replace('Fax: ', '').replace('Télécopieur : ', '')
        if fax:
            m.add_contact('fax', fax, 'legislature')
        # Constituency offices; subsequent offices get a numbered note.
        for i, li in enumerate(mp_page.xpath('//div[@class="constituencyoffices"]//li')):
            spans = li.xpath('./span[not(@class="spacer")]')
            note = 'constituency'
            if i:
                note += ' ({})'.format(i + 1)
            m.add_contact('address', '\n'.join([
                spans[0].text_content(),  # address line 1
                spans[1].text_content(),  # address line 2
                spans[2].text_content(),  # city, region
                spans[3].text_content(),  # postal code
            ]), note)
            voice = li.xpath('./span//text()[contains(., "Telephone:")]|./span//text()[contains(., "Téléphone :")]')
            if voice:
                voice = voice[0].replace('Telephone: ', '').replace('Téléphone : ', '')
                if voice:
                    m.add_contact('voice', voice, note)
            fax = li.xpath('./span//text()[contains(., "Fax:")]|./span//text()[contains(., "Télécopieur :")]')
            if fax:
                fax = fax[0].replace('Fax: ', '').replace('Télécopieur : ', '')
                if fax:
                    m.add_contact('fax', fax, note)
        yield m
def scrape(self):
    """Yield New Brunswick municipal councils and their members.

    Walks the per-classification listing pages, then each municipality's
    detail page. Division names are normalized and mapped to OCD
    census-subdivision IDs before creating organizations and people.
    """
    # The three largest cities have dedicated scrapers elsewhere.
    exclude_divisions = {
        'ocd-division/country:ca/csd:1301006',  # Saint John
        'ocd-division/country:ca/csd:1307022',  # Moncton
        'ocd-division/country:ca/csd:1310032',  # Fredericton
    }
    expected_roles = {
        'Mayor',
        'Councillor',
    }
    # Roles with exactly one holder per municipality (no seat numbers).
    unique_roles = {
        'Mayor',
    }
    # Listing-page label -> council-name infix.
    classifications = {
        'Cities': 'City',
        'Towns': 'Town',
        'Villages': 'Village',
        'Rural Communities': 'Community',
        'Regional Municipality': 'Regional',
    }
    # Site spellings -> census spellings.
    corrections = {
        'Beaubassin-est/East': 'Beaubassin East',
        'Lac-Baker': 'Lac Baker',
        'Saint-François-de-Madawaska': 'Saint-François de Madawaska',
        'Saint-Hilaire': 'Saint Hilaire',
    }
    unknown_names = {
        'Haut-Madawaska',  # incorporated after Census 2016
    }
    # Distinct people sharing a name get synthetic birth dates to stay unique.
    duplicate_names = {
        'Denis Savoie',
        'Josée Levesque',
        'Luc Levesque',
    }
    names_to_ids = {}
    for division in Division.get('ocd-division/country:ca').children('csd'):
        type_id = division.id.rsplit(':', 1)[1]
        # '13' is the New Brunswick province prefix; skip parishes (P).
        if type_id.startswith('13'):
            if division.attrs['classification'] == 'P':
                continue
            if division.name in names_to_ids:
                raise Exception('unhandled collision: {}'.format(division.name))
            else:
                names_to_ids[division.name] = division.id
    page = self.lxmlize(COUNCIL_PAGE)
    list_links = page.xpath('//div[@id="sidebar"]//div[contains(@class, "list")][1]//a')
    birth_date = 1900
    seen = set()
    assert len(list_links), 'No list items found'
    for list_link in list_links:
        page = self.lxmlize(list_link.attrib['href'])
        detail_urls = page.xpath('//td[1]//@href')
        assert len(detail_urls), 'No municipalities found'
        for detail_url in detail_urls:
            page = self.lxmlize(detail_url, encoding='utf-8')
            # Normalize "St." to "Saint" and apply known spelling corrections.
            division_name = re.sub(r'\ASt\b\.?', 'Saint', page.xpath('//h1/text()')[0].split(' - ', 1)[1])
            division_name = corrections.get(division_name, division_name)
            if division_name in unknown_names:
                continue
            division_id = names_to_ids[division_name]
            if division_id in exclude_divisions:
                continue
            if division_id in seen:
                raise Exception('unhandled collision: {}'.format(division_id))
            seen.add(division_id)
            division_name = Division.get(division_id).name
            organization_name = '{} {} Council'.format(division_name, classifications[list_link.text])
            organization = Organization(name=organization_name, classification='government')
            organization.add_source(detail_url)
            address = ', '.join(page.xpath('//div[@class="left_contents"]/p[1]/text()'))
            contacts = page.xpath('//div[@class="left_contents"]/p[contains(., "Contact")]/text()')
            phone = contacts[0].split(':')[1]
            # BUG FIX: reset fax for each municipality. Previously, a page
            # without a fax line either raised NameError (first iteration) or
            # silently reused the previous municipality's fax number.
            fax = None
            if len(contacts) > 1:
                fax = contacts[1].split(':')[1]
            email = self.get_email(page, '//div[@class="left_contents"]', error=False)
            url = page.xpath('//div[@class="left_contents"]//@href[not(contains(., "mailto:"))]')
            if url:
                url = url[0]
            groups = page.xpath('//div[contains(@class, "right_contents")]/p')
            assert len(groups), 'No groups found'
            for group in groups:
                # Group heading is the pluralized role, e.g. "Councillors".
                role = group.xpath('./b/text()')[0].rstrip('s')
                if role not in expected_roles:
                    raise Exception('unexpected role: {}'.format(role))
                councillors = group.xpath('./text()')
                assert len(councillors), 'No councillors found'
                for seat_number, name in enumerate(councillors, 1):
                    if 'vacant' in name.lower():
                        continue
                    if role in unique_roles:
                        district = division_name
                    else:
                        district = '{} (seat {})'.format(division_name, seat_number)
                    organization.add_post(role=role, label=district, division_id=division_id)
                    p = Person(primary_org='government', primary_org_name=organization_name, name=name, district=district, role=role)
                    p.add_source(COUNCIL_PAGE)
                    p.add_source(list_link.attrib['href'])
                    p.add_source(detail_url)
                    if name in duplicate_names:
                        p.birth_date = str(birth_date)
                        birth_date += 1
                    p.add_contact('address', address, 'legislature')
                    # @see https://en.wikipedia.org/wiki/Area_code_506
                    if phone:
                        p.add_contact('voice', phone, 'legislature', area_code=506)
                    if fax:
                        p.add_contact('fax', fax, 'legislature', area_code=506)
                    if email:
                        p.add_contact('email', email)
                    if url:
                        p.add_link(url)
                    p._related[0].extras['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(division_id.rsplit(':', 1)[1])
                    yield p
            yield organization
def scrape(self):
    """Yield BC candidates (from a CSV) and their council organizations.

    Variant that materializes the CSV rows first so an empty feed fails
    the assertion rather than silently producing nothing.
    """
    exclude_divisions = {
    }
    # Electoral areas and other districts that are not municipal councils.
    exclude_districts = {
        'Capital',
        'Capital F',
        'Capital G',
        'Capital H',
        'Central Coast B',
        'Central Okanagan East',
        'Central Okanagan West',
        'Comox Valley B',
        'Comox Valley C',
        'Islands Trust',
        'Kitimat-Stikine C',
        'Kootenay Boundary B',
        'Kootenay Boundary C',
        'Kootenay Boundary D',
        'Kootenay Boundary E',
        'Metro Vancouver A',
        'North Coast A',
        'North Coast C',
        'North Coast D',
        'North Coast E',
        'Okanagan-Similkameen I',
        'Okanagan-Similkameen Olalla Local Community Commission',
        'Qathet A',
        'Qathet B',
        'Qathet C',
        'Qathet D',
        'Qathet E',
    }
    expected_roles = {
        'candidate',
    }
    # Census-subdivision classification -> council-name infix.
    infixes = {
        'CY': 'City',
        'DM': 'District',
        'IGD': 'District',
        'IM': 'Municipal',
        'RGM': 'Regional',
        'T': 'Town',
        'VL': 'Village',
        'RDA': 'District',
    }
    # Distinct people sharing a name get synthetic birth dates to stay unique.
    duplicate_names = {
        'Rick Smith',
        'Sung Y Wong',
        'Elizabeth Taylor',
    }
    names_to_ids = {}
    for division in Division.get('ocd-division/country:ca').children('csd'):
        type_id = division.id.rsplit(':', 1)[1]
        # '59' is the BC province prefix; skip Indian reserves (IRI).
        if type_id.startswith('59'):
            if division.attrs['classification'] == 'IRI':
                continue
            if division.name in names_to_ids:
                # Duplicate name: poison the entry so rows must supply an id.
                names_to_ids[division.name] = None
            else:
                names_to_ids[division.name] = division.id
    reader = self.csv_reader(COUNCIL_PAGE, header=True)
    reader.fieldnames = [field.lower() for field in reader.fieldnames]
    organizations = {}
    birth_date = 1900
    seen = set()
    rows = [row for row in reader]
    assert len(rows), 'No councillors found'
    for row in rows:
        name = row['full name']
        district_name = row['district name']
        if not any(row.values()) or name.lower() in ('', 'vacant') or district_name in exclude_districts:
            continue
        if row['district id']:
            division_id = 'ocd-division/country:ca/csd:{}'.format(row['district id'])
        else:
            division_id = names_to_ids[row['district name']]
        if division_id in exclude_divisions:
            continue
        if not division_id:
            # The name was ambiguous and no explicit district id was given.
            raise Exception('unhandled collision: {}'.format(row['district name']))
        division = Division.get(division_id)
        division_name = division.name
        organization_name = '{} {} Council'.format(division_name, infixes[division.attrs['classification']])
        if division_id not in seen:
            seen.add(division_id)
            organizations[division_id] = Organization(name=organization_name, classification='government')
            organizations[division_id].add_source(COUNCIL_PAGE)
        organization = organizations[division_id]
        role = row['primary role']
        if role not in expected_roles:
            raise Exception('unexpected role: {}'.format(role))
        if row['district id']:
            district = format(division_id)
        else:
            district = division_name
        organization.add_post(role=role, label=district, division_id=division_id)
        p = Person(primary_org='government', primary_org_name=organization_name, name=name, district=district, role=role)
        p.add_source(COUNCIL_PAGE)
        if row['source url']:
            p.add_source(row['source url'])
        if name in duplicate_names:
            p.birth_date = str(birth_date)
            birth_date += 1
        if row['email']:
            p.add_contact('email', row['email'])
        if row['phone']:
            p.add_contact('voice', row['phone'], 'legislature')
        if row['twitter']:
            p.add_link(row['twitter'])
        p._related[0].extras['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(division_id.rsplit(':', 1)[1])
        yield p
    # Organizations are yielded last, after all posts have been added.
    for organization in organizations.values():
        yield organization
def scrape(self):
    """Yield Yukon municipal councils and members parsed from a PDF directory.

    The PDF is downloaded to a scratch file, converted with
    ``pdftotext -layout``, and split into per-municipality blocks on blank
    lines. Column boundaries are inferred from runs of spaces in the header.
    """
    response = urlopen(COUNCIL_PAGE).read()
    pdf_path = '/tmp/yt.pdf'
    # FIX: write in binary mode via a context manager; the download is raw
    # PDF bytes (text mode fails under Python 3 and may corrupt on Windows).
    with open(pdf_path, 'wb') as pdf:
        pdf.write(response)
    try:
        # pdftotext emits UTF-8 by default; decode so regexes work on text.
        data = subprocess.check_output(['pdftotext', '-layout', pdf_path, '-']).decode('utf-8')
        for municipality in re.split(r'\n\s*\n', data):
            if 'Councillors' not in municipality:
                continue
            lines = municipality.split('\n')
            # Drop a page-number header and any leading blank line.
            if 'Page' in lines[0]:
                lines.pop(0)
            if not lines[0].strip():
                lines.pop(0)
            col1end = re.search(r'\s{2,}(\w)', lines[0].strip()).end()
            col2end = re.search(r':\s{2,}(\w)', lines[0].strip()).end()
            if 'Council' in lines[1]:
                # Municipality name wraps onto two lines.
                address = lines[2][:col1end - 1].strip() + ' ' + lines[3][:col1end - 1].strip()
                district = lines[0][:col1end - 1].strip() + ' ' + lines[1][:col1end - 1].strip()
            else:
                address = lines[1][:col1end - 1].strip() + ' ' + lines[2][:col1end - 1].strip()
                district = lines[0][:col1end - 1].strip()
            organization = Organization(
                name=district + ' Council',
                classification='legislature',
                jurisdiction_id=self.jurisdiction.jurisdiction_id)
            organization.add_source(COUNCIL_PAGE)
            yield organization
            phone = re.findall(r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')
            email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
            fax = None
            if 'Fax' in municipality:
                fax = re.findall(r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')
            website = None
            if 'Website' in municipality:
                website = re.findall(r'((http:\/\/|www.)(\S*))', municipality)[0][0]
            # Names appear in the middle column after a "Mayor:" or
            # "Councillors" marker line.
            councillor_or_mayor = False
            for line in lines:
                if 'Mayor:' in line:
                    councillor_or_mayor = True
                    role = 'Mayor'
                    continue
                if 'Councillors' in line:
                    councillor_or_mayor = True
                    role = 'Councillor'
                    continue
                if councillor_or_mayor:
                    councillor = line[col1end - 1:col2end - 1].strip()
                    if not councillor:
                        continue
                    p = Person(primary_org='legislature', name=councillor, district=district)
                    p.add_source(COUNCIL_PAGE)
                    membership = p.add_membership(organization, role=role, district=district)
                    membership.add_contact_detail('address', address, 'legislature')
                    membership.add_contact_detail('voice', phone, 'legislature')
                    membership.add_contact_detail('email', email)
                    if fax:
                        membership.add_contact_detail('fax', fax, 'legislature')
                    if website:
                        p.add_link(website)
                    yield p
    finally:
        # FIX: remove the scratch file with os.remove (not os.system('rm …')),
        # and do so even if parsing fails partway through.
        os.remove(pdf_path)
def scrape(self):
    """Yield organizations, posts and people from a configured CSV feed.

    Subclass-provided attributes drive the parse: ``csv_url``, ``delimiter``,
    ``encoding``, ``skip_rows``, ``header_converter``, ``is_valid_row``,
    ``corrections``, ``many_posts_per_area``, ``unique_roles`` and
    ``other_names``. Rows that raise are reported and skipped.
    """
    organizations = {}
    # Per-role, per-district counters used to label repeated seats.
    seat_numbers = defaultdict(lambda: defaultdict(int))
    reader = self.csv_reader(self.csv_url, delimiter=self.delimiter, header=True, encoding=self.encoding, skip_rows=self.skip_rows)
    reader.fieldnames = [self.header_converter(field) for field in reader.fieldnames]
    for row in reader:
        try:
            if self.is_valid_row(row):
                # Corrections are either callables or value-replacement maps.
                for key, corrections in self.corrections.items():
                    if not isinstance(corrections, dict):
                        row[key] = corrections(row[key])
                    elif row[key] in corrections:
                        row[key] = corrections[row[key]]
                organization_classification = 'legislature'
                organization_name = row['organization']
                organization_key = organization_name.lower()
                if organization_key in organizations:
                    organization = organizations[organization_key]
                else:
                    # First sighting of this organization: yield it once.
                    organization = Organization(organization_name, classification=organization_classification)
                    organization.add_source(self.csv_url)
                    yield organization
                    organizations[organization_key] = organization
                if not row['primary role']:
                    row['primary role'] = 'Councillor'
                role = row['primary role']
                post = Post(role=role, label=organization_name, organization_id=organization._id)
                yield post
                name = row['name'].strip(' .,')
                district = row['district name']
                if self.many_posts_per_area and role not in self.unique_roles:
                    seat_numbers[role][district] += 1
                    district = '{} (seat {})'.format(district, seat_numbers[role][district])
                p = Person(primary_org=organization_classification, name=name, district=district, role=role, party=row.get('party name'))
                p.add_source(self.csv_url)
                # Optional columns use row.get; required ones index directly.
                if row.get('gender'):
                    p.gender = row['gender']
                if row.get('photo url'):
                    p.image = row['photo url']
                if row.get('source url'):
                    p.add_source(row['source url'].strip(' .,'))
                if row.get('website'):
                    p.add_link(row['website'], note='web site')
                if row.get('facebook'):
                    # Strip tracking fragments/query strings from the URL.
                    p.add_link(re.sub(r'[#?].+', '', row['facebook']))
                if row.get('twitter'):
                    p.add_link(row['twitter'])
                if row['email']:
                    p.add_contact('email', row['email'].strip(' .,'))
                if row['address']:
                    p.add_contact('address', row['address'], 'legislature')
                if row.get('phone'):
                    p.add_contact('voice', row['phone'], 'legislature')
                if row.get('fax'):
                    p.add_contact('fax', row['fax'], 'legislature')
                if row.get('cell'):
                    p.add_contact('cell', row['cell'], 'legislature')
                if row.get('birth date'):
                    p.birth_date = row['birth date']
                if row.get('incumbent'):
                    p.extras['incumbent'] = row['incumbent']
                if name in self.other_names:
                    for other_name in self.other_names[name]:
                        p.add_name(other_name)
                # Validate person entity so that we can catch the exception if needed.
                p.validate()
                yield p
        except Exception as e:
            # Deliberately broad: a bad row is reported and skipped so one
            # malformed record does not abort the whole feed.
            print(repr(e))
            continue
def scrape(self):
    """Yield council Organizations and member Persons from the territory-wide
    municipal directory PDF.

    The PDF is fetched, converted to text with ``pdftotext -layout``, and split
    into per-municipality chunks on blank lines.  Column boundaries within a
    chunk are derived from the whitespace runs on its first line.
    """
    response = urlopen(COUNCIL_PAGE).read()
    # FIX: write in binary mode (the body is raw PDF bytes, not text) and use a
    # context manager so the handle is always closed.
    with open('/tmp/yt.pdf', 'wb') as pdf:
        pdf.write(response)
    try:
        data = subprocess.check_output(['pdftotext', '-layout', '/tmp/yt.pdf', '-'])
    finally:
        # FIX: remove the temp file as soon as it has been converted, even if
        # pdftotext fails.  The original only ran `os.system('rm /tmp/yt.pdf')`
        # after the generator was fully consumed, leaking the file otherwise.
        os.unlink('/tmp/yt.pdf')
    data = re.split(r'\n\s*\n', data)
    for municipality in data:
        if 'Councillors' not in municipality:
            continue
        lines = municipality.split('\n')
        # Drop a leading page-number line and any blank line that follows it.
        if 'Page' in lines[0]:
            lines.pop(0)
        if not lines[0].strip():
            lines.pop(0)
        # End offsets of the first and second layout columns.
        col1end = re.search(r'\s{2,}(\w)', lines[0].strip()).end()
        col2end = re.search(r':\s{2,}(\w)', lines[0].strip()).end()
        # The municipality name may wrap onto a second line ("... Council").
        if 'Council' in lines[1]:
            address = lines[2][:col1end - 1].strip() + ' ' + lines[3][:col1end - 1].strip()
            district = lines[0][:col1end - 1].strip() + ' ' + lines[1][:col1end - 1].strip()
        else:
            address = lines[1][:col1end - 1].strip() + ' ' + lines[2][:col1end - 1].strip()
            district = lines[0][:col1end - 1].strip()
        organization = Organization(name=district + ' Council',
                                    classification='legislature',
                                    jurisdiction_id=self.jurisdiction.jurisdiction_id)
        organization.add_source(COUNCIL_PAGE)
        yield organization
        phone = re.findall(r'(?<=Phone: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')
        email = re.findall(r'(?<=E-mail:) (\S*)', municipality)[0]
        fax = None
        if 'Fax' in municipality:
            fax = re.findall(r'(?<=Fax: )\(?(\d{3}[\)-] ?\d{3}-\d{4})', municipality)[0].replace(') ', '-')
        website = None
        if 'Website' in municipality:
            website = re.findall(r'((http:\/\/|www.)(\S*))', municipality)[0][0]
        # Lines after a "Mayor:" or "Councillors" header list the members; the
        # member name sits in the second layout column.
        councillor_or_mayor = False
        for line in lines:
            if 'Mayor:' in line:
                councillor_or_mayor = True
                role = 'Mayor'
                continue
            if 'Councillors' in line:
                councillor_or_mayor = True
                role = 'Councillor'
                continue
            if councillor_or_mayor:
                councillor = line[col1end - 1:col2end - 1].strip()
                if not councillor:
                    continue
                p = Person(primary_org='legislature', name=councillor, district=district)
                p.add_source(COUNCIL_PAGE)
                membership = p.add_membership(organization, role=role, district=district)
                membership.add_contact_detail('address', address, 'legislature')
                membership.add_contact_detail('voice', phone, 'legislature')
                membership.add_contact_detail('email', email)
                if fax:
                    membership.add_contact_detail('fax', fax, 'legislature')
                if website:
                    p.add_link(website)
                yield p
def scrape(self):
    """Yield a government Organization and its member Persons for every New
    Brunswick municipality listed on the directory site.

    Municipalities are matched to OCD census-subdivision identifiers by name;
    name collisions and repeated divisions raise immediately so bad data is
    caught rather than silently merged.
    """
    # Municipalities scraped by their own dedicated scrapers.
    exclude_divisions = {
        'ocd-division/country:ca/csd:1301006',  # Saint John
        'ocd-division/country:ca/csd:1307022',  # Moncton
        'ocd-division/country:ca/csd:1310032',  # Fredericton
    }
    expected_roles = {
        'Mayor',
        'Councillor',
    }
    unique_roles = {
        'Mayor',
    }
    # Section heading on the site -> classification word used in the council name.
    classifications = {
        'Cities': 'City',
        'Towns': 'Town',
        'Villages': 'Village',
        'Rural Communities': 'Community',
        'Regional Municipality': 'Regional',
    }
    # Site spellings -> OCD division spellings.
    corrections = {
        'Beaubassin-est/East': 'Beaubassin East',
        'Lac-Baker': 'Lac Baker',
        'Saint-François-de-Madawaska': 'Saint-François de Madawaska',
        'Saint-Hilaire': 'Saint Hilaire',
    }
    unknown_names = {
        'Haut-Madawaska',  # incorporated after Census 2016
    }
    # People sharing a name are disambiguated with synthetic birth dates below.
    duplicate_names = {
        'Denis Savoie',
        'Josée Levesque',
        'Luc Levesque',
    }

    # Build a name -> OCD division ID lookup for New Brunswick (SGC code 13),
    # skipping parishes ('P') and refusing ambiguous names outright.
    names_to_ids = {}
    for division in Division.get('ocd-division/country:ca').children('csd'):
        type_id = division.id.rsplit(':', 1)[1]
        if type_id.startswith('13'):
            if division.attrs['classification'] == 'P':
                continue
            if division.name in names_to_ids:
                raise Exception('unhandled collision: {}'.format(division.name))
            else:
                names_to_ids[division.name] = division.id

    page = self.lxmlize(COUNCIL_PAGE)
    list_links = page.xpath('//div[@id="sidebar"]//div[contains(@class, "list")][1]//a')
    birth_date = 1900
    seen = set()
    assert len(list_links), 'No list items found'
    for list_link in list_links:
        page = self.lxmlize(list_link.attrib['href'])
        detail_urls = page.xpath('//td[1]//@href')
        assert len(detail_urls), 'No municipalities found'
        for detail_url in detail_urls:
            page = self.lxmlize(detail_url, encoding='utf-8')
            # Normalize "St"/"St." to "Saint" before looking up the division.
            division_name = re.sub(r'\ASt\b\.?', 'Saint', page.xpath('//h1/text()')[0].split(' - ', 1)[1])
            division_name = corrections.get(division_name, division_name)
            if division_name in unknown_names:
                continue
            division_id = names_to_ids[division_name]
            if division_id in exclude_divisions:
                continue
            if division_id in seen:
                raise Exception('unhandled collision: {}'.format(division_id))
            seen.add(division_id)
            division_name = Division.get(division_id).name
            organization_name = '{} {} Council'.format(division_name, classifications[list_link.text])
            organization = Organization(name=organization_name, classification='government')
            organization.add_source(detail_url)
            address = ', '.join(page.xpath('//div[@class="left_contents"]/p[1]/text()'))
            contacts = page.xpath('//div[@class="left_contents"]/p[contains(., "Contact")]/text()')
            phone = contacts[0].split(':')[1]
            # FIX: reset fax for every municipality.  It was previously only
            # assigned when present, so the first fax-less page raised
            # NameError and later ones inherited the previous page's number.
            fax = None
            if len(contacts) > 1:
                fax = contacts[1].split(':')[1]
            email = self.get_email(page, '//div[@class="left_contents"]', error=False)
            url = page.xpath('//div[@class="left_contents"]//@href[not(contains(., "mailto:"))]')
            if url:
                url = url[0]
            groups = page.xpath('//div[contains(@class, "right_contents")]/p')
            assert len(groups), 'No groups found'
            for group in groups:  # renamed from `p`, which shadowed the Person below
                role = group.xpath('./b/text()')[0].rstrip('s')
                if role not in expected_roles:
                    raise Exception('unexpected role: {}'.format(role))
                councillors = group.xpath('./text()')
                assert len(councillors), 'No councillors found'
                for seat_number, name in enumerate(councillors, 1):
                    if 'vacant' in name.lower():
                        continue
                    if role in unique_roles:
                        district = division_name
                    else:
                        district = '{} (seat {})'.format(division_name, seat_number)
                    organization.add_post(role=role, label=district, division_id=division_id)
                    p = Person(primary_org='government', primary_org_name=organization_name,
                               name=name, district=district, role=role)
                    p.add_source(COUNCIL_PAGE)
                    p.add_source(list_link.attrib['href'])
                    p.add_source(detail_url)
                    if name in duplicate_names:
                        # Assign distinct synthetic birth years to keep
                        # same-named people from being merged.
                        p.birth_date = str(birth_date)
                        birth_date += 1
                    p.add_contact('address', address, 'legislature')
                    # @see https://en.wikipedia.org/wiki/Area_code_506
                    if phone:
                        p.add_contact('voice', phone, 'legislature', area_code=506)
                    if fax:
                        p.add_contact('fax', fax, 'legislature', area_code=506)
                    if email:
                        p.add_contact('email', email)
                    if url:
                        p.add_link(url)
                    p._related[0].extras['boundary_url'] = '/boundaries/census-subdivisions/{}/'.format(division_id.rsplit(':', 1)[1])
                    yield p
            yield organization
def scrape(self):
    """Yield a Council Organization and member Persons for each linked district.

    Each district page carries a <dl> of contact details (phone, fax, address,
    email, website) followed by an "Elected Officials" block listing members.
    """
    page = self.lxmlize(COUNCIL_PAGE)
    districts = page.xpath('//div[@id="left-content" or @id="right-content"]//a')
    for district in districts:
        url = district.attrib['href']
        page = self.lxmlize(url)
        org = Organization(name=district.text_content() + ' Council',
                           classification='legislature',
                           jurisdiction_id=self.jurisdiction.jurisdiction_id)
        org.add_source(url)
        yield org
        # FIX: reset the contact fields for every district.  They were never
        # reinitialized, so a district missing a field either raised NameError
        # (first iteration) or silently reused the previous district's value.
        phone = fax = address = email = site = None
        info = page.xpath('//div[@style="WIDTH:750"]/dl')
        for contact in info:
            contact_type = contact.xpath('./dt')[0].text_content()
            contact = contact.xpath('./dd')[0].text_content().replace('(', '').replace(') ', '-')
            # The contact list ends where the officials list begins.
            if 'Officials' in contact_type:
                break
            if 'Tel' in contact_type:
                phone = contact
            if 'Fac' in contact_type:
                fax = contact
            if 'Address' in contact_type:
                address = contact
            if 'Email' in contact_type:
                email = contact
            if 'Website' in contact_type:
                site = contact
        councillors = page.xpath(
            '//div[@style="WIDTH:750"]/dl/dt[contains(text(), "Elected Officials")]/parent::dl/dd/pre/text()'
        )[0].splitlines(True)
        for councillor in councillors:
            # The role, if any, trails the name in parentheses.
            name = councillor.replace('(Mayor)', '').replace(
                '(Deputy Mayor)', '').replace('(Chairperson)', '').strip()
            role = re.sub(r'\(|\)', '', councillor.replace(name, '').strip())
            if not role:
                role = 'Councillor'
            p = Person(primary_org='legislature', name=name, district=district.text_content())
            p.add_source(COUNCIL_PAGE)
            p.add_source(url)
            membership = p.add_membership(org, role=role, district=district.text_content())
            # FIX: only record details that are actually present; previously a
            # missing value would be passed to the cleaners unconditionally.
            if phone:
                membership.add_contact_detail('voice', self.clean_telephone_number(phone), 'legislature')
            if fax:
                membership.add_contact_detail('fax', self.clean_telephone_number(fax), 'legislature')
            if address:
                membership.add_contact_detail('address', self.clean_address(address), 'legislature')
            if email:
                membership.add_contact_detail('email', email)
            if site:
                p.add_link(site)
            yield p
def scrape(self):
    """Yield the mayor and every councillor parsed from the council roster."""
    page = self.lxmlize(COUNCIL_PAGE)
    cells = page.xpath('//div[@id="printArea"]//table//tr//td')[4:-1]
    # The first populated cell is the mayor; the rest are councillors.
    yield self.scrape_mayor(cells[0])
    for cell in cells[1:]:
        name = ' '.join(cell.xpath('.//strong/a[last()]//text()')[0].split())
        heading = cell.xpath('.//strong//text()')[0]
        # A "Name - Ward" heading marks a ward councillor; no dash means a
        # regional (at-large) councillor.
        try:
            district = heading.split('-')[1]
            role = 'Councillor'
        except IndexError:
            district = 'Newmarket'
            role = 'Regional Councillor'
        url = cell.xpath('.//a/@href')[0]
        person = Person(primary_org='legislature', name=name, district=district, role=role)
        person.add_source(COUNCIL_PAGE)
        person.add_source(url)
        person.image = cell.xpath('.//img/@src')[0]
        detail = self.lxmlize(url)
        paragraphs = detail.xpath('//div[@id="printArea"]')[0].xpath(
            './/p[@class="heading"][2]/following-sibling::p')
        address = paragraphs.pop(0).text_content().strip()
        if not address:
            # Some pages lead with an empty paragraph; take the next one.
            address = paragraphs.pop(0).text_content().strip()
        if 'Ward' in paragraphs[0].text_content():
            paragraphs.pop(0)
        # The numbers paragraph alternates "Label: value" segments on colons.
        segments = paragraphs.pop(0).text_content().split(':')
        person.add_contact('email', self.get_email(detail))
        for index, segment in enumerate(segments):
            if index == 0:
                continue
            if '@' in segment:
                continue  # executive assistant email
            number = re.findall(r'([0-9]{3}-[0-9]{3}-[0-9]{4})', segment)[0]
            ext = re.findall(r'(Ext\. [0-9]{3,4})', segment)
            if ext:
                number += ext[0].replace('Ext. ', ' x')
            # The label for this value is the word ending the previous segment.
            label = re.findall(r'[A-Za-z]+$', segments[index - 1])[0]
            if 'Fax' in label:
                person.add_contact('fax', number, 'legislature')
            elif 'Phone' in label:
                person.add_contact('voice', number, 'legislature')
            else:
                person.add_contact(label, number, label)
        links = detail.xpath('.//a[contains(text(), "http://")]')
        if links:
            person.add_link(links[0].text_content())
        yield person