def get_people(self):
    """Yield a ``Legislator`` for each councillor, then one for the mayor.

    Councillor profile URLs come from the navigation list on
    ``COUNCIL_PAGE``; the mayor is read from ``MAYOR_URL`` afterwards.
    """
    index_page = lxmlize(COUNCIL_PAGE)
    # Only entries 2..9 of the navigation list are considered.
    nav_links = index_page.xpath('//li[@id="pageid2117"]/ul/li/a')[2:10]
    for anchor in nav_links:
        if anchor.text.startswith('Councillor'):
            profile_url = anchor.attrib['href']
            profile = lxmlize(profile_url)
            # The first anchor carrying a title is used for both the name
            # (its title) and the e-mail (its href minus the scheme).
            titled_anchor = profile.xpath('//a[@title]')[0]
            full_name = titled_anchor.attrib['title']
            address = titled_anchor.attrib['href'][len('mailto:'):]
            portrait = profile.xpath(
                'string(//div[@class="pageContent"]//img[@align="right"]/@src)')
            councillor = Legislator(name=full_name, post_id='Abbotsford',
                                    role='Councillor', image=portrait)
            councillor.add_source(profile_url)
            councillor.add_contact('email', address, None)
            yield councillor

    mayor_page = lxmlize(MAYOR_URL)
    # Drop the first word of the heading; the remainder is the name.
    heading = mayor_page.xpath('string(//h1)')
    mayor_name = heading.split(' ', 1)[1]
    mayor_photo = mayor_page.xpath('string(//img[@hspace=10]/@src)')
    # email is hidden behind a form
    mayor = Legislator(name=mayor_name, post_id='Abbotsford', role='Mayor',
                       image=mayor_photo)
    mayor.add_source(MAYOR_URL)
    yield mayor
def get_people(self):
    """Yield the mayor, then one ``Legislator`` per district councillor.

    The council page is rendered by client-side JavaScript, so district
    names and relative profile links are pulled out of the raw response
    text with a regular expression rather than parsed as HTML.
    """
    # mayor first, can't find email
    page = lxmlize(MAYOR_URL)
    photo_url = page.xpath('string(//img/@src[contains(., "Maire")])')
    name = page.xpath('string(//td[@class="contenu"]/text()[last()])')
    p = Legislator(name=name, post_id=u"Trois-Rivières", role="Maire",
                   image=photo_url)
    p.add_source(MAYOR_URL)
    yield p

    resp = requests.get(COUNCIL_PAGE)
    # page rendering through JS on the client
    page_re = re.compile(r'createItemNiv3.+"District (.+?)".+(index.+)\\"')
    # Raw string: "\A" is not a valid string escape, so the non-raw form
    # only worked by accident and warns on recent Python versions.
    article_re = re.compile(r'\A(?:de(?: la)?|des|du) ')
    # These districts keep their leading article.
    keep_article = ('des Estacades', 'des Plateaux', 'des Terrasses',
                    'du Sanctuaire')
    for district, url_rel in page_re.findall(resp.text):
        if district not in keep_article:
            district = article_re.sub('', district)
        url = urljoin(COUNCIL_PAGE, url_rel)
        page = lxmlize(url)
        name = page.xpath('string(//h2)')
        email = page.xpath(
            'string(//a/@href[contains(., "mailto:")])')[len('mailto:'):]
        photo_url = page.xpath(
            'string(//img/@src[contains(., "Conseiller")])')
        p = Legislator(name=name, post_id=district, role='Conseiller',
                       image=photo_url)
        p.add_source(url)
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield a ``Legislator`` for every councillor, then one for the mayor.

    The council table is consumed four rows at a time (district, name,
    contact details, one ignored row). The mayor's name and e-mail come
    from ``MAYOR_PAGE`` and ``MAYOR_CONTACT_URL`` respectively.
    """
    page = lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    nodes = page.xpath('//table[@width="484"]//tr')
    for rows in chunks(nodes, 4):
        # Keep the try body to the unpacking alone so a ValueError raised
        # while building a Legislator is no longer silently swallowed.
        try:
            district_row, councillor_row, contact_row, _ = rows
        except ValueError:
            # on the last run through, there will be less than 4 rows to unpack
            break
        post_id = district_row.xpath('string(.//strong)')
        name = councillor_row.xpath('string(.)')[len('Councillor '):]
        # TODO: phone numbers on site don't include area code. Add manually?
        # phone = contact_row.xpath('string(td[2]/text())')
        email = contact_row.xpath('string(td[4]/a)').replace('[at]', '@')
        p = Legislator(name=name, post_id=post_id, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        # p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        yield p

    mayor_page = lxmlize(MAYOR_PAGE, 'iso-8859-1')
    # The heading ends with " Bio"; strip that suffix to get the name.
    name = mayor_page.xpath('string(//h1[contains(., "Bio")])')[:-len(' Bio')]
    contact_page = lxmlize(MAYOR_CONTACT_URL, 'iso-8859-1')
    email = contact_page.xpath('string(//a[contains(., "@")][1])')
    # This record is scraped from the mayor's pages, so label it as the
    # mayor rather than as another councillor.
    p = Legislator(name=name, post_id='Halifax', role='Mayor')
    p.add_source(MAYOR_PAGE)
    p.add_source(MAYOR_CONTACT_URL)
    p.add_contact('email', email, None)
    yield p
def get_people(self):
    """Yield a ``Legislator`` for each link in the council listing.

    A link whose text contains "Ward N" is a councillor for that ward;
    any other link is treated as the mayor of Kawartha Lakes.
    """
    listing = lxmlize(COUNCIL_PAGE)
    for anchor in listing.xpath('//p[@class="WSIndent"]/a'):
        label = anchor.text_content()
        wards = re.findall(r'(Ward [0-9]{1,2})', label)
        if wards:
            post, role = wards[0], 'Councillor'
            name = label.replace(post, '').strip()
        else:
            post, role = 'Kawartha Lakes', 'Mayor'
            name = label.replace('Mayor', '').strip()

        profile_url = anchor.attrib['href']
        profile = lxmlize(profile_url)
        mailto = profile.xpath('//a[contains(@href, "mailto:")]/@href')[0]
        # Everything after the last colon is the address itself.
        email = mailto.rsplit(':', 1)[1].strip()
        portrait = profile.xpath('//img[@class="image-right"]/@src')[0]

        member = Legislator(name=name, post_id=post, role=role)
        member.add_source(COUNCIL_PAGE)
        member.add_source(profile_url)
        member.add_contact('email', email, None)
        member.image = portrait
        yield member
def get_people(self):
    """Yield the mayor (first sub-navigation link) and then each councillor.

    The first link is delegated to ``self.scrape_mayor``; every other link
    is followed and its phone, fax, e-mail, image and optional address are
    attached to a ``Legislator``.
    """
    listing = lxmlize(COUNCIL_PAGE)
    anchors = listing.xpath('//div[@id="subnav"]//a')
    for position, anchor in enumerate(anchors):
        name = anchor.xpath('./span/text()')[0].strip()
        district = anchor.xpath('.//strong')[0].text_content()
        url = anchor.attrib['href']
        if position == 0:
            yield self.scrape_mayor(name, url)
            continue

        profile = lxmlize(url)
        address_nodes = profile.xpath(
            '//div[@id="content"]//p[contains(text(),"City of Burlington,")]')
        contact = profile.xpath(
            '//div[@id="subnav"]//p[contains(text(),"Phone")]')[0]
        contact_text = contact.text_content()
        raw_phone = re.findall(r'Phone: (.*)', contact_text)[0]
        phone = raw_phone.replace('Ext. ', 'x').replace('#', 'x')
        fax = re.findall(r'Fax: (.*)', contact_text)[0]
        # NOTE(review): the leading "//" makes this query search the whole
        # document rather than only the contact paragraph.
        email = contact.xpath(
            '//a[contains(@href, "mailto:")]')[0].text_content()

        member = Legislator(name=name, post_id=district, role='Councillor')
        member.add_source(COUNCIL_PAGE)
        member.add_source(url)
        member.image = profile.xpath('//div[@id="subnav"]//img/@src')[0]
        if address_nodes:
            member.add_contact('address', address_nodes[0].text_content(),
                               'legislature')
        member.add_contact('voice', phone, 'legislature')
        member.add_contact('fax', fax, 'legislature')
        member.add_contact('email', email, None)
        yield member
def get_people(self):
    """Yield a ``Legislator`` for each link in the council listing.

    Links mentioning "Ward N" become councillors for that ward; the
    remaining link is handled as the mayor of Kawartha Lakes.
    """
    index_page = lxmlize(COUNCIL_PAGE)
    for link in index_page.xpath('//p[@class="WSIndent"]/a'):
        link_text = link.text_content()
        ward_hits = re.findall(r'(Ward [0-9]{1,2})', link_text)
        if not ward_hits:
            post_id = 'Kawartha Lakes'
            role = 'Mayor'
            person_name = link_text.replace('Mayor', '').strip()
        else:
            post_id = ward_hits[0]
            role = 'Councillor'
            person_name = link_text.replace(post_id, '').strip()

        detail_url = link.attrib['href']
        detail_page = lxmlize(detail_url)
        href = detail_page.xpath('//a[contains(@href, "mailto:")]/@href')[0]
        # Keep only what follows the last colon of the mailto link.
        email = href.rsplit(':', 1)[1].strip()
        photo = detail_page.xpath('//img[@class="image-right"]/@src')[0]

        person = Legislator(name=person_name, post_id=post_id, role=role)
        person.add_source(COUNCIL_PAGE)
        person.add_source(detail_url)
        person.add_contact('email', email, None)
        person.image = photo
        yield person
def scrape_mayor(self, name, url):
    """Build and return the mayor's ``Legislator``.

    Phone, fax, e-mail and image come from the sidebar of ``url``; the
    postal address is read from the contact page linked in the menu of
    the mayor's own website.
    """
    profile = lxmlize(url)
    sidebar_text = profile.xpath(
        '//div[@id="secondary align_RightSideBar"]/blockquote/p/text()')
    # The first two text nodes are used as phone and fax, in that order.
    phone = sidebar_text[0]
    fax = sidebar_text[1]
    email = profile.xpath(
        '//div[@id="secondary align_RightSideBar"]/blockquote/p/a[contains(@href, "mailto:")]/text()'
    )[0]

    site_home = lxmlize('http://www.burlingtonmayor.com')
    contact_link = site_home.xpath(
        '//div[@class="menu"]//a[contains(text(),"Contact")]')[0]
    contact_page = lxmlize(contact_link.attrib['href'])
    address = contact_page.xpath(
        '//div[@class="entry-content"]//p[contains(text(),"City Hall")]'
    )[0].text_content()

    mayor = Legislator(name=name, post_id="Burlington", role='Mayor')
    for source in (COUNCIL_PAGE, url, 'http://www.burlingtonmayor.com'):
        mayor.add_source(source)
    mayor.image = profile.xpath(
        '//div[@id="secondary align_RightSideBar"]/p/img/@src')[0]
    mayor.add_contact('voice', phone, 'legislature')
    mayor.add_contact('fax', fax, 'legislature')
    mayor.add_contact('email', email, None)
    mayor.add_contact('address', address, 'legislature')
    return mayor
def scrape_mayor(self, url):
    """Build and return the mayor's ``Legislator`` from ``url``.

    The name and image come from the mayor's page; the address and phone
    number come from the "Contact the Mayor" page it links to.
    """
    profile = lxmlize(url)
    heading = profile.xpath("//h1/text()")[0]
    mayor = Legislator(heading.replace("Toronto Mayor", "").strip(),
                       post_id="Toronto", role='Mayor')
    mayor.add_source(COUNCIL_PAGE)
    mayor.add_source(url)
    mayor.image = profile.xpath('string(//article/img/@src)')

    contact_link = profile.xpath(
        '//a[contains(text(), "Contact the Mayor")]')[0]
    # @todo fix lxmlize to use the redirected URL to make links absolute
    contact_url = contact_link.attrib['href'].replace('www.', 'www1.')
    mayor.add_source(contact_url)

    contact_page = lxmlize(contact_url)
    # The first two <h3> headings introduce the mail and phone sections.
    mail_heading, phone_heading = contact_page.xpath('//h3')[:2]
    address = ''.join(mail_heading.xpath('./following-sibling::p//text()'))
    phone = phone_heading.xpath('string(./following-sibling::p[1])')
    mayor.add_contact('address', address, 'legislature')
    mayor.add_contact('voice', phone, 'legislature')
    return mayor
def get_people(self):
    """Yield a ``Legislator`` (role ``MHA``) for each occupied table row.

    Party affiliation is looked up by member name in the mapping built
    from ``PARTY_PAGE``. Rows without a link in the first cell are skipped.
    """
    parties_by_member = dict(process_parties(lxmlize(PARTY_PAGE)))
    listing = lxmlize(COUNCIL_PAGE)
    # The first row of the table is skipped.
    for row in listing.xpath('//table[not(@id="footer")]/tr')[1:]:
        # Non-breaking spaces are normalised to plain spaces.
        cell_texts = [
            cell.xpath('string(.)').replace(u'\xa0', u' ') for cell in row
        ]
        name, district, _, email = cell_texts
        phone = row[2].xpath('string(text()[1])')

        profile_links = row[0].xpath('./a/@href')
        if not profile_links:
            continue  # there is a vacant district
        photo_page_url = profile_links[0]
        photo_page = lxmlize(photo_page_url)
        photo_url = photo_page.xpath('string(//table//img/@src)')

        district = district.replace(' - ', u'—')  # m-dash
        party = get_party(parties_by_member[name.strip()])

        member = Legislator(name=name, post_id=district, role='MHA',
                            party=party, image=photo_url)
        member.add_source(COUNCIL_PAGE)
        member.add_source(photo_page_url)
        member.add_contact('email', email, None)
        # TODO: either fix phone regex or tweak phone value
        member.add_contact('voice', phone, 'legislature')
        yield member
def get_people(self):
    """Yield the mayor, then a ``Legislator`` for each councillor.

    The mayor comes from ``mayor_info(MAYOR_PAGE)``. Each councillor
    paragraph on ``COUNCIL_PAGE`` links to a profile page that supplies
    the image, address, phone, e-mail and, when present, fax number.
    """
    yield mayor_info(MAYOR_PAGE)

    page = lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@id="news"]//p')
    for councillor in councillors:
        district = councillor.xpath('./b')[0].text_content()
        # Keep the label from its first "W" or "R" onwards.
        district = re.findall(u'(?:W|R).*', district)[0]
        role = 'Councillor'
        if 'Regional' in district:
            district = 'Cambridge'
            role = 'Regional Councillor'

        link = councillor.xpath('.//a')[0]
        name = link.text_content()
        url = link.attrib['href']

        page = lxmlize(url)
        image = page.xpath('//img[contains(@src, "councilImages")]/@src')[0]
        address = page.xpath(
            '//*[contains(text(),"Address")]/ancestor::td'
        )[-1].text_content().split(':')[-1].replace("\t", '')
        phone = page.xpath(
            '//*[contains(text(),"Tel")]/ancestor::td'
        )[-1].text_content().split(':')[-1].replace("\t", '')
        phone = phone.replace('(', '').replace(') ', '-')

        # Reset per councillor: previously ``fax`` was only assigned when a
        # Fax label existed, so a profile without one either raised on the
        # first iteration or reused the previous councillor's number.
        fax = None
        if page.xpath('//*[contains(text(),"Fax")]'):
            fax = page.xpath(
                '//*[contains(text(),"Fax")]/ancestor::td'
            )[-1].text_content().split(':')[-1].replace("\t", '')
            fax = fax.replace('(', '').replace(') ', '-')
        email = page.xpath('//a[contains(@href,"mailto:")]')[0].text_content()

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('address', address, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        if fax is not None:
            p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email, None)
        p.image = image
        yield p
def get_people(self):
    """Yield the mayor, then a ``Legislator`` for each district councillor.

    The page content is produced by client-side JavaScript, so districts,
    member names and profile links are extracted from the script source
    with regular expressions. The first entry of each list is the mayor.
    """
    page = lxmlize(COUNCIL_PAGE)
    # it's all javascript rendered on the client... wow.
    js = page.xpath('string(//div[@class="inner_container"]/div/script[2])')
    districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
    members = re.findall(r'arrayMembres\[a.+"(.+)"', js)
    urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)

    # first item in list is mayor
    p = Legislator(name=members[0], post_id='Gatineau', role='Maire')
    p.add_source(COUNCIL_PAGE)
    # The parsed page is not used; the fetch is kept so behaviour
    # (including any failure on a missing page) is unchanged.
    lxmlize(MAYOR_CONTACT_PAGE)
    p.add_source(MAYOR_CONTACT_PAGE)
    email = '*****@*****.**'  # hardcoded
    p.add_contact('email', email, None)
    yield p

    # zip() returns an iterator on Python 3 and cannot be sliced directly;
    # materialise it first so the mayor entry can be skipped on 2 and 3.
    councillor_rows = list(zip(districts, members, urls))[1:]
    for district, member, url in councillor_rows:
        profile_url = COUNCIL_PAGE + '/' + url.split('/')[-1]
        profile_page = lxmlize(profile_url)
        photo_url = profile_page.xpath('string(//img/@src)')
        # Raw string avoids the invalid "\d" string escape.
        post_id = 'District ' + re.search(r'\d+', district).group(0)
        email = profile_page.xpath(
            'string(//a[contains(@href, "mailto:")]/@href)')[len('mailto:'):]
        p = Legislator(name=member, post_id=post_id, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(profile_url)
        p.image = photo_url
        p.add_contact('email', email, None)
        yield p
def scrape_mayor(url):
    """Build and return the mayor's ``Legislator`` from ``url``.

    Name and image come from the mayor's page; address, phone and fax are
    read from fixed paragraphs of the linked "Contact the Mayor" page.
    """
    profile = lxmlize(url)
    # The last paragraph found inside a table cell holds the name.
    name_node = profile.xpath('//tr/td/p')[-1]
    mayor_name = name_node.text_content().replace('Mayor', '')
    portrait = profile.xpath('//div[@class="sask_ArticleBody"]//img/@src')[0]
    contact_url = profile.xpath(
        '//a[contains(text(), "Contact the Mayor")]/@href')[0]

    contact_page = lxmlize(contact_url)
    address_lines = contact_page.xpath(
        '//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]/p[4]/text()'
    )
    # The first text node of the address paragraph is dropped.
    address = ' '.join(address_lines[1:])
    raw_phone = contact_page.xpath(
        '//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]/p[5]/span/text()'
    )[0]
    phone = raw_phone.replace('(', '').replace(') ', '-')
    raw_fax = contact_page.xpath(
        '//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]/p[6]/span/text()'
    )[0]
    fax = raw_fax.replace('(', '').replace(') ', '-')

    mayor = Legislator(name=mayor_name, post_id='Saskatoon', role='Mayor')
    mayor.add_source(url)
    mayor.image = portrait
    mayor.add_contact('address', address, 'legislature')
    mayor.add_contact('voice', phone, 'legislature')
    mayor.add_contact('fax', fax, 'legislature')
    return mayor
def scrape_mayor(self, name, url):
    """Return a ``Legislator`` for the mayor of Burlington.

    The sidebar of ``url`` supplies phone, fax, e-mail and image; the
    address is taken from the contact page of the mayor's own website.
    """
    council_profile = lxmlize(url)
    blockquote_text = council_profile.xpath(
        '//div[@id="secondary align_RightSideBar"]/blockquote/p/text()')
    # Phone is the first text node, fax the second.
    phone = blockquote_text[0]
    fax = blockquote_text[1]
    email = council_profile.xpath(
        '//div[@id="secondary align_RightSideBar"]/blockquote/p/a[contains(@href, "mailto:")]/text()'
    )[0]

    home = lxmlize('http://www.burlingtonmayor.com')
    contact_href = home.xpath(
        '//div[@class="menu"]//a[contains(text(),"Contact")]'
    )[0].attrib['href']
    contact = lxmlize(contact_href)
    address_node = contact.xpath(
        '//div[@class="entry-content"]//p[contains(text(),"City Hall")]')[0]
    address = address_node.text_content()

    person = Legislator(name=name, post_id="Burlington", role='Mayor')
    person.add_source(COUNCIL_PAGE)
    person.add_source(url)
    person.add_source('http://www.burlingtonmayor.com')
    person.image = council_profile.xpath(
        '//div[@id="secondary align_RightSideBar"]/p/img/@src')[0]
    details = (
        ('voice', phone, 'legislature'),
        ('fax', fax, 'legislature'),
        ('email', email, None),
        ('address', address, 'legislature'),
    )
    for kind, value, note in details:
        person.add_contact(kind, value, note)
    return person
def scrape_mayor(self, div):
    """Build and return the mayor's ``Legislator`` from a navigation link.

    ``div`` is the link element: its text gives the name and its ``href``
    leads to the mayor's page, whose "Contact" link supplies the address,
    phone, fax and e-mail.
    """
    mayor_page = lxmlize(div.attrib['href'])
    name = div.text_content().replace('Mayor ', '')
    contact_url = mayor_page.xpath(
        '//ul[@class="navSecondary"]//a[contains(text(),"Contact")]'
    )[0].attrib['href']

    contact_page = lxmlize(contact_url)
    column = contact_page.xpath('//div[@class="col"][2]')[0]

    address_text = column.xpath('.//p[1]')[0].text_content()
    # Keep everything from "City of Greater" to the end, across newlines.
    address = re.findall(r'(City of Greater .*)', address_text,
                         flags=re.DOTALL)[0]
    phone = column.xpath('.//p[2]')[0].text_content().replace('Phone: ', '')
    # The fax number is the last space-separated token of its paragraph.
    fax = column.xpath('.//p[3]')[0].text_content().split(' ')[-1]
    # NOTE(review): the leading "//" makes this query search the whole
    # document rather than only the contact column.
    email = column.xpath('//a[contains(@href, "mailto:")]')[0].text_content()

    mayor = Legislator(name=name, post_id='Greater Sudbury', role='Mayor')
    mayor.add_source(COUNCIL_PAGE)
    mayor.add_source(contact_url)
    mayor.add_contact('address', address, 'legislature')
    mayor.add_contact('voice', phone, 'legislature')
    mayor.add_contact('fax', fax, 'legislature')
    mayor.add_contact('email', email, None)
    return mayor
def get_people(self):
    """Yield the mayor, then a ``Legislator`` for each councillor row.

    The mayor and councillor pages are discovered from links on
    ``COUNCIL_PAGE``. Each councillor row has an image cell and an
    information cell holding name, district, e-mail and phone.
    """
    page = lxmlize(COUNCIL_PAGE)
    mayor_url = page.xpath('//a[contains(text(), "Mayor")]/@href')[0]
    yield self.scrape_mayor(mayor_url)

    councillors_url = page.xpath(
        '//a[contains(text(), "Councillors")]/@href')[0]
    cpage = lxmlize(councillors_url)
    # Rows containing an image, minus the last such row.
    councillor_rows = cpage.xpath('//tr[td//img]')[:-1]
    for councillor_row in councillor_rows:
        img_cell, info_cell = tuple(councillor_row)
        name = info_cell.xpath(
            'string(.//span[contains(text(), "Councillor")])'
        )[len('Councillor '):]
        district = info_cell.xpath(
            'string(.//p[contains(text(), "District")])')
        email = info_cell.xpath('string(.//a[contains(@href, "mailto:")])')
        if not email:
            # Fall back to the plain text following the "E-mail" label.
            email = info_cell.xpath(
                'string(.//strong[contains(text(), "E-mail")]'
                '/following-sibling::text())')
        phone = info_cell.xpath(
            'string(.//p[contains(.//text(), "Telephone:")])').split(':')[1]
        # Search relative to this cell and read the image's ``src``. The
        # old query ("//img/@href") was document-wide and asked for an
        # attribute images do not carry, so it always returned "" and the
        # image ended up being the page URL.
        img_url_rel = img_cell.xpath('string(.//img/@src)')

        p = Legislator(name=name, post_id=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(councillors_url)
        p.add_contact('email', email, None)
        p.add_contact('voice', phone, 'legislature')
        if img_url_rel:
            p.image = urljoin(councillors_url, img_url_rel)
        yield p
def get_people(self):
    """Yield a ``Legislator`` for every councillor, then one for the mayor.

    Council table rows are processed in groups of four (district, name,
    contact details, one ignored row). The mayor's name is read from
    ``MAYOR_PAGE`` and the e-mail from ``MAYOR_CONTACT_URL``.
    """
    page = lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    nodes = page.xpath('//table[@width="484"]//tr')
    for group in chunks(nodes, 4):
        # Only the unpacking is guarded, so a ValueError raised further
        # down is no longer mistaken for "ran out of rows".
        try:
            district_row, councillor_row, contact_row, _ = group
        except ValueError:
            # on the last run through, there will be less than 4 rows to unpack
            break
        post_id = district_row.xpath('string(.//strong)')
        name = councillor_row.xpath('string(.)')[len('Councillor '):]
        # TODO: phone numbers on site don't include area code. Add manually?
        # phone = contact_row.xpath('string(td[2]/text())')
        email = contact_row.xpath('string(td[4]/a)').replace('[at]', '@')
        p = Legislator(name=name, post_id=post_id, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        # p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        yield p

    mayor_page = lxmlize(MAYOR_PAGE, 'iso-8859-1')
    # Strip the trailing " Bio" from the heading to obtain the name.
    name = mayor_page.xpath(
        'string(//h1[contains(., "Bio")])')[:-len(' Bio')]
    contact_page = lxmlize(MAYOR_CONTACT_URL, 'iso-8859-1')
    email = contact_page.xpath('string(//a[contains(., "@")][1])')
    # Scraped from the mayor's pages, so the role is Mayor, not Councillor.
    p = Legislator(name=name, post_id='Halifax', role='Mayor')
    p.add_source(MAYOR_PAGE)
    p.add_source(MAYOR_CONTACT_URL)
    p.add_contact('email', email, None)
    yield p
def scrape_mayor(self, div):
    """Return a ``Legislator`` for the mayor of Greater Sudbury.

    ``div`` is the navigation link to the mayor's page. The contact page
    linked from that page's secondary navigation provides the address,
    phone, fax and e-mail.
    """
    landing_url = div.attrib['href']
    landing = lxmlize(landing_url)
    mayor_name = div.text_content().replace('Mayor ', '')

    contact_anchor = landing.xpath(
        '//ul[@class="navSecondary"]//a[contains(text(),"Contact")]')[0]
    contact_url = contact_anchor.attrib['href']
    contact = lxmlize(contact_url)
    block = contact.xpath('//div[@class="col"][2]')[0]

    first_paragraph = block.xpath('.//p[1]')[0].text_content()
    # DOTALL lets the match run across line breaks to the end of the text.
    address = re.findall(
        r'(City of Greater .*)', first_paragraph, flags=re.DOTALL)[0]
    second_paragraph = block.xpath('.//p[2]')[0].text_content()
    phone = second_paragraph.replace('Phone: ', '')
    third_paragraph = block.xpath('.//p[3]')[0].text_content()
    fax = third_paragraph.split(' ')[-1]
    # NOTE(review): this "//" query is evaluated against the whole
    # document, not just ``block``.
    email = block.xpath('//a[contains(@href, "mailto:")]')[0].text_content()

    person = Legislator(name=mayor_name, post_id='Greater Sudbury',
                        role='Mayor')
    person.add_source(COUNCIL_PAGE)
    person.add_source(contact_url)
    for kind, value, note in (
            ('address', address, 'legislature'),
            ('voice', phone, 'legislature'),
            ('fax', fax, 'legislature'),
            ('email', email, None)):
        person.add_contact(kind, value, note)
    return person
def get_people(self):
    """Yield the mayor (first table cell) and then each councillor.

    Cells without any link are skipped. For the rest, the first link is
    the name, the second the district, and the linked profile page
    supplies phone numbers, extra links and the e-mail address.
    """
    listing = lxmlize(COUNCIL_PAGE)
    cells = listing.xpath(
        '//div[@class="article-content"]//td[@class="ms-rteTableOddCol-0"]')
    yield scrape_mayor(cells[0])

    for cell in cells[1:]:
        anchors = cell.xpath('.//a')
        if not anchors:
            continue
        name = anchors[0].text_content().strip()
        district = anchors[1].text_content()
        url = cell.xpath('.//a/@href')[0]
        profile = lxmlize(url)

        member = Legislator(name=name, post_id=district, role='Conseiller')
        member.add_source(COUNCIL_PAGE)
        member.add_source(url)
        # The image is taken from the last <img> in the preceding cells.
        member.image = cell.xpath('./preceding-sibling::td//img/@src')[-1]

        text_nodes = profile.xpath(
            './/td[@class="ms-rteTableOddCol-0"]//text()')
        for fragment in text_nodes:
            # Any text node containing a digit is recorded as a phone number.
            if re.search(r'[0-9]', fragment):
                member.add_contact(
                    'voice', fragment.strip().replace(' ', '-'), 'legislature')

        get_links(member,
                  profile.xpath('.//td[@class="ms-rteTableOddCol-0"]')[0])
        mailto = profile.xpath(
            'string(//a[contains(@href, "mailto:")]/@href)')
        member.add_contact('email', mailto[len('mailto:'):], None)
        yield member
def get_people(self):
    """Yield a ``Legislator`` for each linked council member.

    A profile whose first ``<h2>`` contains "Maire" is the mayor; any
    other profile is a councillor whose district is read from the page.
    Brompton and Lennoxville also receive a ``boundary_url`` extra.
    """
    borough_boundaries = {
        'Brompton': '/boundaries/sherbrooke-boroughs/brompton/',
        'Lennoxville': '/boundaries/sherbrooke-boroughs/lennoxville/',
    }
    listing = lxmlize(COUNCIL_PAGE)
    for anchor in listing.xpath('//div[@id="c2087"]//a'):
        name = anchor.text_content()
        url = anchor.attrib['href']
        profile = lxmlize(url)

        if 'Maire' in profile.xpath('//h2/text()')[0]:
            district, role = 'Sherbrooke', 'Maire'
        else:
            raw_district = profile.xpath(
                '//div[@class="csc-default"]//a[@target="_blank"]/text()')[0]
            district = raw_district.replace('district', '')
            district = district.replace('Domaine Howard', 'Domaine-Howard')
            district = district.strip()
            role = 'Conseiller'
        if district in ('de Brompton', 'de Lennoxville'):
            district = district.replace('de ', '')

        member = Legislator(name=name, post_id=district, role=role)
        member.add_source(COUNCIL_PAGE)
        member.add_source(url)
        member.image = profile.xpath(
            '//div[@class="csc-textpic-image csc-textpic-last"]//img/@src')[0]

        # The label before the colon is passed as both type and note.
        phone_parts = profile.xpath(
            '//li[contains(text(), "phone")]/text()')[0].split(':')
        label = phone_parts[0]
        member.add_contact(label, phone_parts[1], label)

        mailto_links = profile.xpath('//a[contains(@href, "mailto:")]/@href')
        if mailto_links:
            member.add_contact('email', mailto_links[0].split(':')[1], None)

        if district in borough_boundaries:
            member.add_extra('boundary_url', borough_boundaries[district])
        yield member
def get_people(self):
    """Yield the mayor, then one ``Legislator`` per district councillor.

    Because the council page is rendered by JavaScript in the browser,
    districts and relative profile links are extracted from the raw
    response body with a regular expression.
    """
    # mayor first, can't find email
    page = lxmlize(MAYOR_URL)
    photo_url = page.xpath('string(//img/@src[contains(., "Maire")])')
    name = page.xpath('string(//td[@class="contenu"]/text()[last()])')
    p = Legislator(name=name, post_id=u"Trois-Rivières", role="Maire",
                   image=photo_url)
    p.add_source(MAYOR_URL)
    yield p

    resp = requests.get(COUNCIL_PAGE)
    # page rendering through JS on the client
    page_re = re.compile(r'createItemNiv3.+"District (.+?)".+(index.+)\\"')
    # Compiled once, as a raw string: "\A" in a normal string literal is
    # an invalid escape sequence that newer Python versions warn about.
    leading_article_re = re.compile(r"\A(?:de(?: la)?|des|du) ")
    # Districts whose leading article is kept.
    article_kept = ("des Estacades", "des Plateaux", "des Terrasses",
                    "du Sanctuaire")
    for district, url_rel in page_re.findall(resp.text):
        if district not in article_kept:
            district = leading_article_re.sub("", district)
        url = urljoin(COUNCIL_PAGE, url_rel)
        page = lxmlize(url)
        name = page.xpath("string(//h2)")
        email = page.xpath(
            'string(//a/@href[contains(., "mailto:")])')[len("mailto:"):]
        photo_url = page.xpath('string(//img/@src[contains(., "Conseiller")])')
        p = Legislator(name=name, post_id=district, role="Conseiller",
                       image=photo_url)
        p.add_source(url)
        p.add_contact("email", email, None)
        yield p
def get_people(self):
    """Yield a ``Legislator`` for each member linked from the sub-navigation.

    The first link is the mayor of Ajax; for the others the ward and role
    are parsed from the profile page heading. Rows of the first data table
    become contact details or, when not phone/fax/e-mail, plain links.
    """
    listing = lxmlize(COUNCIL_PAGE)
    anchors = listing.xpath('//ul[@class="subNav top"]/li/ul//li/a')
    for position, anchor in enumerate(anchors):
        name = anchor.text_content()
        url = anchor.attrib['href']
        profile = lxmlize(url)

        if position == 0:
            district, role = 'Ajax', 'Mayor'
        else:
            heading = profile.xpath(
                '//div[@id="printAreaContent"]//h1')[0].text_content()
            district = re.findall(r'Ward.*', heading)[0].strip()
            # First match, whole-match group: "Regional Councillor" or
            # "Councillor".
            role = re.findall('((Regional)? ?(Councillor))', heading)[0][0]

        member = Legislator(name=name, post_id=district, role=role)
        member.add_source(COUNCIL_PAGE)
        member.add_source(url)
        member.image = profile.xpath(
            '//div[@class="intQuicklinksPhoto"]/img/@src')[0]

        # The first table row is skipped.
        for row in profile.xpath('//table[@class="datatable"][1]//tr')[1:]:
            columns = row.xpath('./td')
            label = columns[0].text_content().strip()
            value = columns[1].text_content().strip()
            if not re.match(r'(Phone)|(Fax)|(Email)', label):
                member.add_link(value, None)
                continue
            kind = CONTACT_DETAIL_TYPE_MAP[label]
            note = None if kind == 'email' else 'legislature'
            member.add_contact(kind, value, note)
        yield member
def get_people(self):
    """Yield a ``Legislator`` (role ``MLA``) for each member cell.

    Name and riding come from the listing cell; photo, e-mail and phone
    are read from the member's detail page.
    """
    listing = lxmlize(COUNCIL_PAGE)
    cells = listing.xpath(
        '//div[@class="views-field views-field-field-picture"]/'
        'parent::td')
    for cell in cells:
        picture_block, name_block, riding_block = cell[0], cell[1], cell[2]
        name = name_block.text_content().replace(' .', '. ')  # typo on page
        riding = riding_block.text_content()
        if 'Mackenzie Delta' in riding:
            riding = 'Mackenzie-Delta'

        detail_url = picture_block.xpath('string(.//a/@href)')
        detail = lxmlize(detail_url)
        photo = detail.xpath(
            'string(//div[@class="field-item even"]/img/@src)')
        email = detail.xpath('string(//a[contains(@href, "mailto:")])')
        first_paragraph = detail.xpath(
            'string(//div[@property="content:encoded"]/p[1])')
        # Group 2 is the run of digits and dashes after "P:" or "Phone:".
        phone_match = re.search(r'P(hone)?: ([-0-9]+)', first_paragraph)
        phone = phone_match.group(2)

        member = Legislator(name=name, post_id=riding, role='MLA',
                            image=photo)
        member.add_source(COUNCIL_PAGE)
        member.add_source(detail_url)
        member.add_contact('email', email, None)
        member.add_contact('voice', phone, 'legislature')
        yield member
def get_people(self):
    """Yield the mayor (first navigation link) and then each councillor.

    Iteration stops at the first later link whose text has no "-".
    Entries named "Vacant" are skipped.
    """
    listing = lxmlize(COUNCIL_PAGE)
    anchors = listing.xpath('//div[@id="navMultilevel"]//a')
    for position, anchor in enumerate(anchors):
        if position == 0:
            yield self.scrape_mayor(anchor)
            continue

        label = anchor.text_content()
        if '-' not in label:
            break
        district, name = label.split(' - ')
        if name == 'Vacant':
            continue

        profile_url = anchor.attrib['href']
        profile = lxmlize(profile_url)
        address = profile.xpath(
            '//div[@class="column last"]//p')[0].text_content()

        tel_text = profile.xpath(
            '//article[@id="primary"]//*[contains(text(),"Tel")]'
        )[0].text_content()
        # Keep the text from its first digit onwards.
        phone = re.findall(r'([0-9].*)', tel_text)[0].replace(') ', '-')
        fax_text = profile.xpath(
            '//article[@id="primary"]//*[contains(text(),"Fax")]'
        )[0].text_content()
        fax = re.findall(r'([0-9].*)', fax_text)[0].replace(') ', '-')
        email = profile.xpath(
            '//a[contains(@href, "mailto:")]')[0].text_content()

        member = Legislator(name=name, post_id=district, role='Councillor')
        member.add_source(COUNCIL_PAGE)
        member.add_source(profile_url)
        member.add_contact('address', address, 'legislature')
        member.add_contact('voice', phone, 'legislature')
        member.add_contact('fax', fax, 'legislature')
        member.add_contact('email', email, None)
        # The second image inside the primary article is used.
        member.image = profile.xpath('//article[@id="primary"]//img/@src')[1]
        yield member
def get_people(self):
    """Yield each Abbotsford councillor as a ``Legislator``, then the mayor.

    Councillor pages are found through the navigation list on
    ``COUNCIL_PAGE``; the mayor's details are read from ``MAYOR_URL``.
    """
    council_index = lxmlize(COUNCIL_PAGE)
    # Slice [2:10] restricts the scan to eight navigation entries.
    candidates = council_index.xpath('//li[@id="pageid2117"]/ul/li/a')[2:10]
    scheme_length = len('mailto:')
    for candidate in candidates:
        if not candidate.text.startswith('Councillor'):
            continue
        member_url = candidate.attrib['href']
        member_page = lxmlize(member_url)
        # The first anchor with a title gives the name (title) and the
        # e-mail (href with the scheme prefix removed).
        first_titled = member_page.xpath('//a[@title]')[0]
        member_name = first_titled.attrib['title']
        member_email = first_titled.attrib['href'][scheme_length:]
        member_photo = member_page.xpath(
            'string(//div[@class="pageContent"]//img[@align="right"]/@src)')
        member = Legislator(name=member_name, post_id='Abbotsford',
                            role='Councillor', image=member_photo)
        member.add_source(member_url)
        member.add_contact('email', member_email, None)
        yield member

    mayor_page = lxmlize(MAYOR_URL)
    # Everything after the first space of the heading is the name.
    mayor_name = mayor_page.xpath('string(//h1)').split(' ', 1)[1]
    mayor_photo = mayor_page.xpath('string(//img[@hspace=10]/@src)')
    # email is hidden behind a form
    mayor = Legislator(name=mayor_name, post_id='Abbotsford', role='Mayor',
                       image=mayor_photo)
    mayor.add_source(MAYOR_URL)
    yield mayor
def get_people(self):
    """Yield the mayor, then a ``Legislator`` for each district councillor.

    Districts, member names and profile links are extracted with regular
    expressions from the inline script that renders the page on the
    client. Index 0 of each extracted list is the mayor.
    """
    page = lxmlize(COUNCIL_PAGE)
    # it's all javascript rendered on the client... wow.
    js = page.xpath('string(//div[@class="inner_container"]/div/script[2])')
    districts = re.findall(r'arrayDistricts\[a.+"(.+)"', js)
    members = re.findall(r'arrayMembres\[a.+"(.+)"', js)
    urls = re.findall(r'arrayLiens\[a.+"(.+)"', js)

    # first item in list is mayor
    p = Legislator(name=members[0], post_id='Gatineau', role='Maire')
    p.add_source(COUNCIL_PAGE)
    # The result of this fetch is unused; the request itself is retained
    # so an unreachable contact page still surfaces exactly as before.
    lxmlize(MAYOR_CONTACT_PAGE)
    p.add_source(MAYOR_CONTACT_PAGE)
    email = '*****@*****.**'  # hardcoded
    p.add_contact('email', email, None)
    yield p

    # On Python 3 zip() is an iterator and does not support slicing;
    # building a list first keeps this working on both major versions.
    entries = list(zip(districts, members, urls))
    for district, member, url in entries[1:]:
        profile_url = COUNCIL_PAGE + '/' + url.split('/')[-1]
        profile_page = lxmlize(profile_url)
        photo_url = profile_page.xpath('string(//img/@src)')
        # Raw string: "\d" is not a valid escape in a normal literal.
        post_id = 'District ' + re.search(r'\d+', district).group(0)
        email = profile_page.xpath(
            'string(//a[contains(@href, "mailto:")]/@href)')[len('mailto:'):]
        p = Legislator(name=member, post_id=post_id, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(profile_url)
        p.image = photo_url
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield one record per councillor card, then the mayor's record.

    Each ``councillorCard`` div on ``COUNCIL_PAGE`` is passed to
    ``councillor_data``; ``MAYOR_PAGE`` is passed to ``mayor_data``.
    """
    listing = lxmlize(COUNCIL_PAGE)
    cards = listing.xpath('//div[@class="councillorCard"]')
    for card in cards:
        yield councillor_data(card)
    yield mayor_data(lxmlize(MAYOR_PAGE))
def get_people(self):
    """Yield the mayor, then a ``Legislator`` for each councillor cell.

    A heading containing "-" gives the ward after the dash; a heading
    without one marks the Regional Councillor for Newmarket. Address,
    numbers, e-mail and an optional website come from the profile page.
    """
    listing = lxmlize(COUNCIL_PAGE)
    # The first four cells and the last one are not member cells.
    cells = listing.xpath('//div[@id="printArea"]//table//tr//td')[4:-1]
    yield self.scrape_mayor(cells[0])

    for cell in cells[1:]:
        # Collapse runs of whitespace in the name.
        name = ' '.join(cell.xpath('string(.//strong/a[last()])').split())
        heading_parts = cell.xpath('string(.//strong)').split('-')
        if len(heading_parts) > 1:
            district, role = heading_parts[1], 'Councillor'
        else:
            district, role = 'Newmarket', 'Regional Councillor'
        url = cell.xpath('.//a/@href')[0]

        member = Legislator(name=name, post_id=district, role=role)
        member.add_source(COUNCIL_PAGE)
        member.add_source(url)
        member.image = cell.xpath('.//img/@src')[0]

        profile = lxmlize(url)
        print_area = profile.xpath('//div[@id="printArea"]')[0]
        paragraphs = print_area.xpath(
            './/p[@class="heading"][2]/following-sibling::p')
        address = paragraphs.pop(0).text_content().strip()
        if not address:
            # An empty first paragraph is skipped in favour of the next.
            address = paragraphs.pop(0).text_content().strip()
        if 'Ward' in paragraphs[0].text_content():
            paragraphs.pop(0)
        # Splitting on ":" leaves each value preceded by a piece that
        # ends with its label.
        numbers = paragraphs.pop(0).text_content().split(':')

        email = profile.xpath('//a[contains(@href, "mailto:")]/text()')[0]
        member.add_contact('email', email, None)

        for label_piece, value_piece in zip(numbers, numbers[1:]):
            if '@' in value_piece:
                continue  # executive assistant email
            number = re.findall(
                r'([0-9]{3}-[0-9]{3}-[0-9]{4})', value_piece)[0]
            extension = re.findall(r'(Ext\. [0-9]{3,4})', value_piece)
            if extension:
                number = number + extension[0].replace('Ext. ', ' x')
            contact_type = re.findall(r'[A-Za-z]+$', label_piece)[0]
            if 'Fax' in contact_type:
                member.add_contact('fax', number, 'legislature')
            elif 'Phone' in contact_type:
                member.add_contact('voice', number, 'legislature')
            else:
                member.add_contact(contact_type, number, contact_type)

        website = profile.xpath('.//a[contains(text(), "http://")]')
        if website:
            member.add_link(website[0].text_content(), None)
        yield member
def get_people(self):
    """Yield a ``Legislator`` for each profile linked from the council page.

    The second text line of the profile decides the post: "<name> Ward"
    is a ward councillor, "At Large" an at-large councillor, and anything
    else the mayor.
    """
    listing = lxmlize(COUNCIL_PAGE)
    profile_urls = listing.xpath('//a[contains(@title, "Profile")][1]/@href')
    for profile_url in profile_urls:
        profile = lxmlize(profile_url)
        info = profile.xpath('//table/tbody/tr/td[2]')[0]
        # Turn <br> elements into newlines so the text splits into lines.
        for line_break in info.xpath('*//br'):
            line_break.tail = '\n' + (line_break.tail or '')
        lines = []
        for raw_line in info.text_content().split('\n'):
            stripped = raw_line.strip()
            if stripped:
                lines.append(stripped)

        name = lines[0].replace('Councillor ', '').replace('Mayor ', '')
        second_line = lines[1]
        if second_line.endswith(' Ward'):
            district, role = second_line.replace(' Ward', ''), 'Councillor'
        elif second_line == 'At Large':
            district, role = 'Thunder Bay', 'Councillor'
        else:
            district, role = 'Thunder Bay', 'Mayor'
        for title in ('Councillor', 'At Large', 'Mayor'):
            name = name.replace(title, '')
        name = name.strip()

        member = Legislator(name=name, post_id=district, role=role)
        member.add_source(COUNCIL_PAGE)
        member.add_source(profile_url)
        member.image = profile.xpath('//td[@valign="top"]/img/@src')[0]

        address = ', '.join(info.xpath('./p/text()')[0:2]).strip()
        address = re.sub(r'\s{2,}', ' ', address)
        member.add_contact('address', address, 'legislature')

        for entry in info.xpath('./p[2]/text()'):
            label, value = entry.split(':')
            value = value.replace('(1st)', '').replace('(2nd)', '').strip()
            if 'Fax' in label:
                member.add_contact('fax', value, 'legislature')
            elif 'Email' in label:
                # Stop at the e-mail line; the address is read below.
                break
            else:
                member.add_contact('voice', value, label)

        email = info.xpath(
            './/a[contains(@href, "mailto:")]')[0].text_content()
        member.add_contact('email', email, None)
        yield member
def get_people(self):
    """Yield the mayor and then every councillor as a ``Legislator``.

    The ward is the text after "-" in the cell heading; when there is no
    dash the member is the Regional Councillor for Newmarket. The profile
    page provides the address paragraphs, numbers, e-mail and website.
    """
    council_page = lxmlize(COUNCIL_PAGE)
    # Leading four cells and the trailing cell are excluded.
    member_cells = council_page.xpath(
        '//div[@id="printArea"]//table//tr//td')[4:-1]
    yield self.scrape_mayor(member_cells[0])

    phone_pattern = r'([0-9]{3}-[0-9]{3}-[0-9]{4})'
    extension_pattern = r'(Ext\. [0-9]{3,4})'
    for member_cell in member_cells[1:]:
        raw_name = member_cell.xpath('string(.//strong/a[last()])')
        name = ' '.join(raw_name.split())
        info_parts = member_cell.xpath('string(.//strong)').split('-')
        if len(info_parts) < 2:
            district = 'Newmarket'
            role = 'Regional Councillor'
        else:
            district = info_parts[1]
            role = 'Councillor'
        url = member_cell.xpath('.//a/@href')[0]

        person = Legislator(name=name, post_id=district, role=role)
        person.add_source(COUNCIL_PAGE)
        person.add_source(url)
        person.image = member_cell.xpath('.//img/@src')[0]

        detail = lxmlize(url)
        container = detail.xpath('//div[@id="printArea"]')[0]
        blocks = container.xpath(
            './/p[@class="heading"][2]/following-sibling::p')
        address = blocks.pop(0).text_content().strip()
        if not address:
            # Fall through to the next paragraph when the first is blank.
            address = blocks.pop(0).text_content().strip()
        if 'Ward' in blocks[0].text_content():
            blocks.pop(0)
        numbers = blocks.pop(0).text_content().split(':')

        email = detail.xpath('//a[contains(@href, "mailto:")]/text()')[0]
        person.add_contact('email', email, None)

        # Index 0 holds only the first label, so values start at index 1.
        for index in range(1, len(numbers)):
            value = numbers[index]
            if '@' in value:
                continue  # executive assistant email
            number = re.findall(phone_pattern, value)[0]
            ext = re.findall(extension_pattern, value)
            if ext:
                number = number + ext[0].replace('Ext. ', ' x')
            # The label is the trailing word of the preceding piece.
            contact_type = re.findall(r'[A-Za-z]+$', numbers[index - 1])[0]
            if 'Fax' in contact_type:
                person.add_contact('fax', number, 'legislature')
            elif 'Phone' in contact_type:
                person.add_contact('voice', number, 'legislature')
            else:
                person.add_contact(contact_type, number, contact_type)

        site_links = detail.xpath('.//a[contains(text(), "http://")]')
        if site_links:
            person.add_link(site_links[0].text_content(), None)
        yield person
def get_people(self):
    """Yield the mayor, then every councillor whose seat is not vacant.

    The mayor is the first link whose href contains "mayor". Each link
    whose href contains "councillors/" is fetched and handed to
    ``self.scrape_councilor`` unless its heading reports a vacant seat.
    """
    listing = lxmlize(COUNCIL_PAGE)
    mayor_link = listing.xpath('//a[contains(@href,"mayor")]')[0]
    yield self.scrape_mayor(mayor_link.attrib['href'])

    for link in listing.xpath('//a[contains(@href,"councillors/")]'):
        profile_url = link.attrib['href']
        profile = lxmlize(profile_url)
        heading = profile.xpath('string(//h1)')
        if 'Council seat is vacant' in heading:
            continue
        yield self.scrape_councilor(profile, heading, profile_url)
def get_people(self):
    """Yield the mayor (first block) and then each councillor.

    Listing blocks provide name, district, phone, e-mail and website; the
    linked profile page adds the image, address and blog, Facebook and
    Twitter links when present.
    """
    listing = lxmlize(COUNCIL_PAGE)
    blocks = listing.xpath('//*[@class="two_third last"]')
    for position, block in enumerate(blocks):
        if position == 0:
            yield self.scrape_mayor(block)
            continue

        first_link = block.xpath('.//a')[0]
        name = first_link.text_content().replace(
            'Councillor', '').replace('Mayor', '')
        # Non-blank text nodes: index 2 is the district, index 3 the phone.
        text_nodes = block.xpath('.//text()[normalize-space()]')
        district = text_nodes[2]
        url = block.xpath('.//a')[0].attrib['href']

        member = Legislator(name=name, post_id=district, role='Councillor')
        member.add_source(COUNCIL_PAGE)
        member.add_source(url)
        member.add_contact('voice', text_nodes[3].replace('extension', 'x'),
                           'legislature')

        mail_links = block.xpath('.//a[contains(@href,"mailto:")]')
        if mail_links:
            member.add_contact('email', mail_links[0].text_content(), None)
        site_links = block.xpath('.//a[contains(text(),"Website")]')
        if site_links:
            member.add_link(site_links[0].attrib['href'], None)

        profile = lxmlize(url)
        member.image = profile.xpath('//header/img/@src')[0]
        body_text = profile.xpath(
            '//div[@class="entry-content"]')[0].text_content()
        addresses = re.findall(r'Address: (.*)Phone', body_text)
        if addresses:
            member.add_contact('address', addresses[0], 'legislature')

        # Each query contributes its first match, if any, as a link.
        link_queries = (
            '//a[contains(text(),"Blog")]',
            '//div[@class="entry-content"]//a[contains(@href, "facebook")]',
            '//div[@class="entry-content"]//a[contains(@href, "twitter")]',
        )
        for query in link_queries:
            matches = profile.xpath(query)
            if matches:
                member.add_link(matches[0].attrib['href'], None)
        yield member
def get_people(self):
    """Yield every ward councillor, then the mayor.

    Ward links come from ``COUNCIL_PAGE`` and are handed to
    ``councillor_data``; the mayor's "Connecting" link comes from
    ``MAYOR_PAGE`` and is handed to ``mayor_data``.
    """
    ward_links_query = ('//div[@class="imageLinkContent"]/'
                        'a[starts-with(text(), "Ward")]/@href')
    for ward_url in lxmlize(COUNCIL_PAGE).xpath(ward_links_query):
        yield councillor_data(ward_url)

    connecting_query = ('string(//a[@class="headingLink"]'
                        '[contains(text(), "Connecting")]/@href)')
    connecting_url = lxmlize(MAYOR_PAGE).xpath(connecting_query)
    yield mayor_data(connecting_url)
def get_people(self):
    """Yield one record per ward councillor followed by the mayor's record.

    ``councillor_data`` receives each "Ward ..." link found on
    ``COUNCIL_PAGE``; ``mayor_data`` receives the href of the "Connecting"
    heading link found on ``MAYOR_PAGE``.
    """
    council_index = lxmlize(COUNCIL_PAGE)
    ward_urls = council_index.xpath(
        '//div[@class="imageLinkContent"]/a[starts-with(text(), "Ward")]/@href')
    for ward_url in ward_urls:
        yield councillor_data(ward_url)

    mayor_index = lxmlize(MAYOR_PAGE)
    # string(...) yields '' rather than raising when the link is absent.
    target = mayor_index.xpath(
        'string(//a[@class="headingLink"][contains(text(), "Connecting")]/@href)')
    yield mayor_data(target)
def get_people(self):
    """Yield an ``Organization`` per district and a ``Legislator`` per member.

    Walks three levels of pages: ``COUNCIL_PAGE`` -> one page per organization
    type -> one page per district.  For each district the organization is
    yielded first, then each listed (non-vacant) member with a membership in
    that organization carrying the district's shared contact details.
    """
    page = lxmlize(COUNCIL_PAGE)
    # Only the first four links are followed; a link's position is also the
    # index used to look up its suffix in org_types.
    types = page.xpath('//div[@class="bluearrow shaded bottomborder "][1]/ul/li/a/@href')[:4]
    for org_type, link in enumerate(types):
        page = lxmlize(link)
        district_urls = page.xpath('//div[@class="parbase list section cplist"]/table/tr/td[1]/b/a/@href')
        for district_url in district_urls:
            page = lxmlize(district_url)
            # The page header is split on ' - '; the second part is the district.
            district = page.xpath('//div[@class="pageHeader"]/h1/text()')[0].split(' - ')[1].strip()

            org = Organization(name=district + org_types[org_type], classification='legislature', jurisdiction_id=self.jurisdiction.jurisdiction_id)
            org.add_source(district_url)
            yield org

            # Contact details below are shared by every member of the district.
            address = ', '.join(page.xpath('//div[@class="left_contents"]/p[1]/text()'))
            contacts = page.xpath('//div[@class="left_contents"]/p[b[text() = "Contact"]]/text()')
            # First text node is used as the phone, second as the fax; the
            # value is whatever follows the first ':'.
            phone = contacts[0].split(':')[1].strip().replace(' ', '-')
            fax = contacts[1].split(':')[1].strip().replace(' ', '-')
            # `email` and `site` stay as empty (falsy) lists when nothing matches.
            email = page.xpath('//div[@class="left_contents"]//a[contains(@href, "mailto:")]')
            if email:
                email = email[0].text_content()

            site = page.xpath('//div[@class="left_contents"]//a[not(contains(@href,"mailto:"))]')
            if site:
                site = site[0].text_content()

            councillors = page.xpath('//div[@class="right_contents"]//p/text()')
            for i, councillor in enumerate(councillors):
                if 'Vacant' in councillor:
                    continue
                p = Legislator(name=councillor, post_id=district)
                p.add_source(COUNCIL_PAGE)
                p.add_source(link)
                p.add_source(district_url)

                # The first listed name is given the Mayor role.
                if i == 0:
                    membership = p.add_membership(org, role='Mayor')
                else:
                    membership = p.add_membership(org, role='Councillor')

                membership.post_id = district
                membership.add_contact_detail('address', address, 'legislature')
                if phone:
                    membership.add_contact_detail('voice', phone, 'legislature')
                if fax:
                    membership.add_contact_detail('fax', fax, 'legislature')
                if email:
                    membership.add_contact_detail('email', email, None)
                if site:
                    p.add_link(site, None)
                yield p
def get_people(self):
    """Yield a ``Legislator`` for every profile linked from ``COUNCIL_PAGE``.

    Each profile page is parsed for the person's name, ward/role, image,
    address, phone/fax numbers and e-mail address.
    """
    page = lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//a[contains(@title, "Profile")][1]/@href')
    for councillor in councillors:
        page = lxmlize(councillor)
        info = page.xpath('//table/tbody/tr/td[2]')[0]

        # Turn <br> elements into real newlines so text_content() can be
        # split into one entry per visual line.
        for br in info.xpath('*//br'):
            br.tail = '\n' + br.tail if br.tail else '\n'
        lines = [line.strip() for line in info.text_content().split('\n') if line.strip()]

        name = lines[0].replace('Councillor ', '').replace('Mayor ', '')

        # The second line decides ward, at-large or mayor.
        if lines[1].endswith(' Ward'):
            district = lines[1].replace(' Ward', '')
            role = 'Councillor'
        elif lines[1] == 'At Large':
            district = 'Thunder Bay'
            role = 'Councillor'
        else:
            district = 'Thunder Bay'
            role = 'Mayor'
        # Remove any title fragments still present in the name.
        name = name.replace('Councillor', '').replace('At Large', '').replace('Mayor', '').strip()

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(councillor)

        p.image = page.xpath('//td[@valign="top"]/img/@src')[0]

        address = ', '.join(info.xpath('./p/text()')[0:2]).strip()
        address = re.sub(r'\s{2,}', ' ', address)
        p.add_contact('address', address, 'legislature')

        contacts = info.xpath('./p[2]/text()')
        for contact in contacts:
            contact_type, contact = contact.split(':')
            contact = contact.replace('(1st)', '').replace('(2nd)', '').strip()
            if 'Fax' in contact_type:
                p.add_contact('fax', contact, 'legislature')
            elif 'Email' in contact_type:
                # The e-mail address is read from the mailto link below.
                break
            else:
                p.add_contact('voice', contact, contact_type)

        email = info.xpath('.//a[contains(@href, "mailto:")]')[0].text_content()
        p.add_contact('email', email, None)

        yield p
def get_people(self):
    """Yield the mayor and every councillor linked from ``COUNCIL_PAGE``.

    The mayor and the "Ward 4" councillor are delegated to ``scrape_mayor``
    and ``scrape_4``; everyone else is built here from the listing cell and
    the second "microSiteLinksWrapper" block of their own page.
    """
    listing = lxmlize(COUNCIL_PAGE)
    mayor_href = listing.xpath('//a[contains(text(), "Office of the Mayor")]/@href')[0]
    yield scrape_mayor(mayor_href)

    for cell in listing.xpath('//div[@class="interiorContentWrapper"]//td[./a]'):
        name = cell.xpath('.//strong')[1].text_content().strip()
        link_text = cell.xpath('.//a//text()[normalize-space()]')[0]
        if 'Ward' in link_text:
            district, role = link_text.replace('Councillor', ''), 'Councillor'
        else:
            # Without a ward, the link text itself is used as the role.
            district, role = 'Markham', link_text

        image = cell.xpath('.//img/@src')[0]
        url = cell.xpath('.//a/@href')[0]

        if 'Ward 4' in district:
            yield scrape_4(name, url, image)
            continue

        profile = lxmlize(url)
        member = Legislator(name=name, post_id=district, role=role)
        for source in (COUNCIL_PAGE, url):
            member.add_source(source)
        member.image = image

        contact = profile.xpath('//div[@class="microSiteLinksWrapper"]')[1]
        # Prefer <p> text; fall back to <div> text when there is none.
        infos = contact.xpath('.//p/text()') or contact.xpath('.//div/text()')

        joined = ' '.join(infos[:2])
        address = re.sub(r'\s{2,}', ' ', joined).strip()
        phone = infos[2].split(':')[1].strip()
        email = contact.xpath('.//a[contains(@href,"mailto:")]/text()')[0]
        websites = contact.xpath('.//a[not( contains(@href, "mailto:"))]/text()')
        if websites:
            member.add_link(websites[0], None)

        member.add_contact('address', address, 'legislature')
        member.add_contact('voice', phone, 'legislature')
        member.add_contact('email', email, None)

        get_links(member, contact)
        yield member
def get_people(self):
    """Yield a ``Legislator`` for each "L4" link on ``COUNCIL_PAGE``.

    The link text supplies the role (first word) and the name (the rest); the
    linked page supplies the photo URL and e-mail address.
    """
    listing = lxmlize(COUNCIL_PAGE)
    for anchor in listing.xpath('//a[@class="L4"]'):
        role, name = anchor.text_content().split(' ', 1)
        profile_url = anchor.attrib['href']
        profile = lxmlize(profile_url)

        member = Legislator(
            name=name,
            post_id='Coquitlam',
            role=role,
            image=profile.xpath('string(//img[@class="img-right"]/@src)'),
        )
        member.add_source(COUNCIL_PAGE)
        member.add_source(profile_url)
        member.add_contact(
            'email',
            profile.xpath('string(//a[starts-with(@href, "mailto:")])'),
            None,
        )
        yield member
def get_people(self):
    """Yield a ``Legislator`` (role ``MLA``) for each row of the MLA table.

    Name, party and district come from the listing row; addresses, phone and
    fax numbers, website and e-mail come from the member's own page.
    """
    listing = lxmlize(COUNCIL_PAGE)
    # The first table row is skipped.
    for row in listing.xpath('//table[@id="MLAs"]//tr')[1:]:
        name = row.xpath('./td')[0].text_content().split('. ', 1)[1]
        party = row.xpath('./td')[1].text
        district = row.xpath('./td')[2].text_content()
        url = row.xpath('./td[1]/a/@href')[0]
        profile = lxmlize(url)

        member = Legislator(name=name, post_id=district, role='MLA', party=party)
        member.add_source(COUNCIL_PAGE)
        member.add_source(url)

        contact = profile.xpath('//table[@id="mla-contact"]//tr[2]')[0]
        website = contact.xpath('./td[3]//div[3]//a')
        if website:
            member.add_link(website[0].text_content(), None)

        legislature_address = contact.xpath('./td[1]/div[2]')[0].text_content()
        member.add_contact('address', legislature_address, 'legislature')
        constituency_address = ''.join(contact.xpath('./td[2]/div//text()')[1:7])
        member.add_contact('address', constituency_address, 'constituency')

        # Order: legislature voice, constituency voice, legislature fax,
        # constituency fax.
        raw_numbers = [
            contact.xpath('./td[1]/div[3]')[0].text_content().split(':')[1].strip(),
            contact.xpath('./td[2]/div[4]//span/text()')[0],
            contact.xpath('./td[1]/div[4]')[0].text_content().split(':')[1].strip(),
            contact.xpath('./td[2]/div[5]//span/text()')[0],
        ]
        # Numbers shorter than ten characters get the 306 prefix.
        numbers = [
            value if len(value) >= 10 else '306-%s' % value
            for value in raw_numbers
        ]
        targets = (
            ('voice', 'legislature'),
            ('voice', 'constituency'),
            ('fax', 'legislature'),
            ('fax', 'constituency'),
        )
        for (kind, note), value in zip(targets, numbers):
            member.add_contact(kind, value, note)

        email = contact.xpath('./td[3]//a[contains(@href, "mailto:")]/text()')[0]
        member.add_contact('email', email, None)

        yield member
def get_people(self):
    """Yield a ``Legislator`` for every councillor cell on ``COUNCIL_PAGE``.

    Each cell provides the name, division, image, address, e-mail and the
    fax/cell/home numbers.  The reeve's name is read from ``REEVE_URL`` but is
    currently only referenced by the commented-out membership code below.

    Raises:
        Exception: if a phone label other than Fax, Cell or Hm is found.
    """
    reeve_page = lxmlize(REEVE_URL)
    # Text of the first <b> element, up to the first comma.
    reeve_name = reeve_page.xpath('string(//b)').split(',')[0]

    page = lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//table[@class="table-plain"]/tbody/tr/td[2]')
    for councillor in councillors:
        # The heading holds the name followed by "Division N".
        name = councillor.xpath('./h2')[0].text_content().split(
            'Division')[0].strip()
        district = re.findall(r'(Division [0-9])', councillor.xpath('./h2')[0].text_content())[0]

        p = Legislator(name=name, post_id=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)

        # The photo sits in a cell preceding this one in the same row.
        image = councillor.xpath('./preceding-sibling::td//img/@src')[0]
        p.image = image

        address = councillor.xpath('./p[1]')[0].text_content()
        email = councillor.xpath('.//a[contains(@href, "mailto:")]')[0].text_content()

        p.add_contact('address', address, 'legislature')
        p.add_contact('email', email, None)

        # Remove the e-mail part, then split on ':' so that each label ends
        # one piece and its number starts the next piece.
        numbers = councillor.xpath('./p[2]')[0].text_content().replace('Email: ', '').replace(email, '').split(':')
        for index, number in enumerate(numbers):
            if index == 0:
                # The first piece carries only a label.
                continue
            contact_type = re.findall(r'[A-Za-z]+', numbers[index - 1])[0]
            # '.' in the pattern matches any separator character; literal
            # dots in the match are then replaced with dashes.
            number = re.findall(r'[0-9]{3}.[0-9]{3}.[0-9]{4}', number)[0].replace('.', '-')
            if contact_type == 'Fax':
                p.add_contact('fax', number, 'legislature')
            elif contact_type == 'Cell':
                p.add_contact('cell', number, 'legislature')
            elif contact_type == 'Hm':
                p.add_contact('voice', number, 'residence')
            else:
                raise Exception('Unrecognized contact type %s' % contact_type)

        # @todo Uncomment when upgrading from Pupa 0.0.3.
        # if name == reeve_name:
        #     membership = Membership(
        #         p._id,
        #         'jurisdiction::ocd-jurisdiction/country:ca/csd:4819006/council',
        #         post_id='district::Grande Prairie County No. 1',
        #         contact_details=p._contact_details,
        #         role='Reeve')
        #     p._related.append(membership)

        yield p
def get_people(self):
    """Yield a ``Legislator`` for each member linked from ``COUNCIL_PAGE``.

    The page title of every profile decides the role (Mayor, Regional
    Councillor or Councillor) and district.  Phone, fax and e-mail are read
    from the profile's contact web part, and social-media links from the
    "WebPartWPQ5" block.
    """
    page = lxmlize(COUNCIL_PAGE)

    councillors = page.xpath('//div[@class="PL_Column1"]//ul[@class="dfwp-list"][1]/li/div/div/a')
    for councillor in councillors:
        url = councillor.attrib['href']
        page = lxmlize(url)

        title = page.xpath('//div[@class="PL_Title"]')[0].text_content()
        if "Councillor" in title:
            # Text before "Councillor" is the district, text after is the name.
            district, name = re.split(r'Councillor', title)
            role = 'Councillor'
            if "Regional" in district:
                district = "Vaughan"
                role = 'Regional Councillor'
        else:
            name = re.split(r'Mayor', title)[-1]
            district = 'Vaughan'
            role = 'Mayor'
        name = name.strip()

        # The first listed person uses a different contact web part.
        if councillor == councillors[0]:
            contact_info = page.xpath('//div[@id="WebPartWPQ2"]')[0]
        else:
            contact_info = page.xpath('//div[@id="WebPartWPQ3"]')[0]

        contact_text = contact_info.text_content()
        phone = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4} ext. [0-9]{4}', contact_text)[0].replace('ext. ', 'x')
        # The second bare number in the block is used as the fax number.
        fax = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4}', contact_text)[1]
        email = contact_info.xpath('.//a[contains(@href, "mailto:")]')[0].text_content()

        p = Legislator(name=name, post_id=district.strip(), role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email, None)

        image = page.xpath('//img[contains(@alt, "Councillor")]/@src')
        if image:
            p.image = image[0]

        # Social links are looked up inside the links web part only.  The
        # original fetched this element but then searched the whole page, so
        # any matching link elsewhere on the page was attributed to the person.
        sites = page.xpath('//div[@id="WebPartWPQ5"]')[0]
        for link_query in (
            './/a[contains(@href,"facebook")]',
            './/a[contains(@href,"twitter")]',
            './/a[contains(@href,"youtube")]',
        ):
            found = sites.xpath(link_query)
            if found:
                p.add_link(found[0].attrib['href'], None)

        yield p
def get_people(self):
    """Yield the mayor and every ward councillor linked from ``COUNCIL_PAGE``.

    The mayor is delegated to ``scrape_mayor``.  Councillor e-mail addresses
    are looked up by ward in the option list on ``EMAIL_URL``, falling back to
    the first mailto link on the councillor's own page.  Phone and fax values
    are read from the label/value sequence in the profile's contact paragraph.
    """
    page = lxmlize(COUNCIL_PAGE)

    mayor_url = page.xpath(
        '//td[@class="sask_LeftNavLinkContainer"]/a/@href')[0]
    yield scrape_mayor(mayor_url)

    # Map ward label (option text before ' - ') to the option's value.
    email_page = lxmlize(EMAIL_URL)
    c_options = email_page.xpath(
        '//select[@id="councillorList"]/option[contains(text(), "Ward")]')
    email_dict = {
        opt.text.split(' - ')[0]: opt.attrib['value'] for opt in c_options
    }

    councillors = page.xpath(
        '//td[@class="sask_LeftNavChildNodeContainer"]//a')
    for councillor in councillors:
        district, name = councillor.text_content().split(' - Councillor ')
        url = councillor.attrib['href']

        p = Legislator(name=name, post_id=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)

        page = lxmlize(url)
        try:
            email = email_dict[district]
        except KeyError:
            email = page.xpath(
                'string(//a[contains(@href, "mailto:")]/@href)')
            # The href carries the URL scheme; keep only the address itself.
            if email.startswith('mailto:'):
                email = email[len('mailto:'):]
        p.add_contact('email', email, None)

        contacts = page.xpath('//p[@class="para12"]')[0]
        if not contacts.text_content().strip():
            # First paragraph is blank; the details are in the second one.
            contacts = page.xpath('//p[@class="para12"]')[1]
        # Labels and values are separated by non-breaking spaces.
        contacts = re.split(r'\xa0', contacts.text_content())
        contacts = [x for x in contacts if x.strip()]

        last_index = len(contacts) - 1
        for i, contact in enumerate(contacts):
            if 'Contact' in contact:
                continue
            # Stop at the final item by position: it has no following value.
            # Comparing by value ended the loop early whenever an earlier
            # item (e.g. an identical phone and fax number) equalled the last.
            if i == last_index:
                break
            contact_type = contact.replace(':', '').strip()
            value = contacts[i + 1].replace('(', '').replace(') ', '-').strip()
            if 'Fax' in contact_type:
                p.add_contact('fax', value, 'legislature')
            if 'Phone' in contact_type:
                p.add_contact(contact_type, value, contact_type)
        yield p
def mayor_data(url, name):
    """Build and return the mayor's ``Legislator`` record.

    Args:
        url: address of the mayor's page; also the base for the photo URL.
        name: the mayor's name, used as given.

    Returns:
        A ``Legislator`` with role ``Mayor``, three sources, an e-mail
        contact read from ``MAYOR_CONTACT_URL`` and an absolute image URL.
    """
    mayor_page = lxmlize(url)
    relative_photo = mayor_page.xpath(
        'string((//div[@id="contentcontainer"]//img)[1]/@src)')
    photo_url = urljoin(url, relative_photo)

    # The e-mail address is the text of the first link containing an "@".
    email = lxmlize(MAYOR_CONTACT_URL).xpath('string(//a[contains(., "@")][1])')

    mayor = Legislator(name=name, post_id='Regina', role='Mayor')
    for source in (COUNCIL_PAGE, url, MAYOR_CONTACT_URL):
        mayor.add_source(source)
    mayor.add_contact('email', email, None)
    mayor.image = photo_url
    return mayor
def get_people(self):
    """Yield a ``Legislator`` for every "members/" link on ``COUNCIL_PAGE``.

    All members receive the same e-mail address, taken from the first mailto
    link on ``CONTACT_URL``.  No image is set: the image element is inserted
    by a script, so it is not present in the fetched markup.
    """
    shared_email = lxmlize(CONTACT_URL).xpath(
        'string(//a[starts-with(@href, "mailto:")])')

    listing = lxmlize(COUNCIL_PAGE)
    for member_url in listing.xpath('//a/@href[contains(., "members/")]'):
        heading = lxmlize(member_url).xpath('string(//h1)')
        # First word of the heading is the role, the remainder is the name.
        role, name = heading.split(' ', 1)

        member = Legislator(name=name, post_id='Richmond', role=role)
        member.add_source(COUNCIL_PAGE)
        member.add_source(member_url)
        member.add_contact('email', shared_email, None)
        yield member
def get_people(self):
    """Yield a ``Legislator`` (role ``MP``) for each row of the listing table.

    The listing row supplies name, constituency and party; the member's own
    page supplies e-mail, photo, Hill office phone/fax and every constituency
    office's address, phone and fax.
    """
    listing = lxmlize(COUNCIL_PAGE)
    # The first table row is skipped.
    for row in listing.xpath('//div[@class="main-content"]//tr')[1:]:
        name_cell = row.xpath('./td[1]')[0]
        last_name = name_cell.xpath('string(.//span[1])')
        first_name = name_cell.xpath('string(.//span[2])')
        name = '%s %s' % (first_name, last_name)
        constituency = row.xpath('string(./td[2])')
        province = row.xpath('string(./td[3])')  # read but not used below
        party = row.xpath('string(./td[4])')

        url = name_cell.xpath('string(.//a/@href)')
        mp_page = lxmlize(url)
        email = mp_page.xpath(
            'string(//span[@class="caucus"]/a[contains(., "@")])')
        photo = mp_page.xpath(
            'string(//div[@class="profile overview header"]//img/@src)')

        member = Legislator(name=name, post_id=constituency, role='MP',
                            chamber='lower', party=party)
        member.add_source(COUNCIL_PAGE)
        member.add_source(url)
        member.add_contact('email', email, None)
        member.image = photo

        # Hill office details.
        member.add_contact('address', 'House of Commons\nOttawa ON K1A 0A6',
                           'legislature')
        hill_voice = mp_page.xpath(
            'string(//div[@class="hilloffice"]//span[contains(text(), "Telephone:")])')
        if hill_voice:
            member.add_contact('voice', hill_voice.replace('Telephone: ', ''),
                               'legislature')
        hill_fax = mp_page.xpath(
            'string(//div[@class="hilloffice"]//span[contains(text(), "Fax:")])')
        hill_fax = hill_fax.replace('Fax: ', '')
        if hill_fax:
            member.add_contact('fax', hill_fax, 'legislature')

        # One list item per constituency office.
        for office in mp_page.xpath('//div[@class="constituencyoffices"]//li'):
            spans = office.xpath('./span[not(@class="spacer")]')
            # First three spans: street address, city/region, postal code.
            address_lines = [
                spans[0].text_content(),
                spans[1].text_content(),
                spans[2].text_content(),
            ]
            member.add_contact('address', '\n'.join(address_lines), 'constituency')

            office_voice = office.xpath(
                'string(./span[contains(text(), "Telephone:")])')
            office_voice = office_voice.replace('Telephone: ', '')
            if office_voice:
                member.add_contact('voice', office_voice, 'constituency')

            office_fax = office.xpath('string(./span[contains(text(), "Fax:")])')
            office_fax = office_fax.replace('Fax: ', '')
            if office_fax:
                member.add_contact('fax', office_fax, 'constituency')

        yield member
def get_people(self):
    """Yield a ``Legislator`` for the mayor and each councillor.

    People are found as paragraphs containing a link inside the main content
    block of ``COUNCIL_PAGE``; image, address, phone, fax and e-mail come from
    each person's own page.
    """
    page = lxmlize(COUNCIL_PAGE)

    councillors = page.xpath(
        '//div[@id="ctl00_ContentPlaceHolder1_ContentBlock1"]//a/parent::p'
    )
    for councillor in councillors:
        # Skip paragraphs that contain a link but no visible text.
        if not councillor.text_content().strip():
            continue
        if 'Mayor' in councillor.text_content():
            name = councillor.text_content().replace('Mayor ', '')
            district = 'Haldimand County'
            role = 'Mayor'
        else:
            # Councillor paragraphs read "<district> - Councillor <name>".
            district, name = councillor.text_content().split(' - ')
            name = name.replace('Councillor', '').strip()
            district = district.strip()
            role = 'Councillor'

        url = councillor.xpath('.//a')[0].attrib['href']
        page = lxmlize(url)

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)

        p.image = page.xpath(
            '//div[@id="ctl00_ContentPlaceHolder1_ContentBlock1"]//tr[1]/td//img/@src'
        )[0]

        # Text nodes that share a parent with the mailto link.
        info = page.xpath(
            '//a[contains(@href, "mailto:")]/parent::*/text()')
        for i, field, in enumerate(info):
            # A field starting with digits, a space and a capital letter is
            # taken as the first of three consecutive address fields.
            if re.match(r'[0-9]+ [A-Z]', field):
                address = field + ', ' + info[i + 1] + ', ' + info[i + 2]
                p.add_contact('address', address, 'legislature')
            # Space-separated ten-digit numbers are fax or telephone lines.
            if re.findall(r'[0-9]{3} [0-9]{3} [0-9]{4}', field):
                if 'Fax' in field:
                    num = field.replace('Fax: ', '').strip().replace(' ', '-')
                    p.add_contact('fax', num, 'legislature')
                else:
                    num = field.replace('Telephone: ', '').strip().replace(' ', '-')
                    p.add_contact('voice', num, 'legislature')
        email = page.xpath('//a[contains(@href, "mailto:")]/text()')[0]
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield the mayor, then each councillor linked from ``COUNCIL_PAGE``.

    The mayor is the first link in the left-hand menu; councillors are the
    menu links whose cell has a dashed bottom border.  Contact details are
    read from fixed rows of a table on each person's page.
    """
    listing = lxmlize(COUNCIL_PAGE)

    mayor_link = listing.xpath('//td[@class="LeftLinksSectionMenu"]/a')[0]
    mayor_name = mayor_link.text_content().replace('Mayor', '').strip()
    mayor_url = mayor_link.attrib['href']
    mayor_page = lxmlize(mayor_url)

    mayor = Legislator(name=mayor_name, post_id='Westmount', role='Maire')
    mayor.add_source(COUNCIL_PAGE)
    mayor.add_source(mayor_url)

    mayor_table = mayor_page.xpath(
        '//div[@style="padding-right:10px;"]/table')[0]
    # Rows 2-4 of the mayor's table: phone, fax, e-mail.
    mayor_phone = mayor_table.xpath('.//tr[2]/td[2]')[0].text_content()
    mayor_fax = mayor_table.xpath('.//tr[3]/td[2]')[0].text_content()
    mayor_email = mayor_table.xpath('.//tr[4]/td[2]')[0].text_content()
    mayor.add_contact('voice', mayor_phone.replace(' ', '-'), 'legislature')
    mayor.add_contact('fax', mayor_fax.replace(' ', '-'), 'legislature')
    mayor.add_contact('email', mayor_email.strip(), None)
    yield mayor

    councillor_links = listing.xpath(
        '//td[@class="LeftLinksSectionMenu" and contains(@style, "border-bottom-style: dashed;")]/a'
    )
    for number, link in enumerate(councillor_links, 1):
        name = link.text_content().strip()
        url = link.attrib['href']
        profile = lxmlize(url)

        if profile.xpath('boolean(.//div[@class="SectionTitle"][2])'):
            second_title = profile.xpath('.//div[@class="SectionTitle"]')[1]
            district = second_title.text_content().split('-')[0].strip()
        else:
            # No second title: number the district by menu position.
            district = 'District ' + str(number)

        table = profile.xpath('.//div[@style="padding-right:10px;"]/table')[0]
        # Rows 2-3 of a councillor's table: phone, e-mail.
        phone = table.xpath('.//tr[2]/td[2]')[0].text_content().replace(' ', '-')
        email = table.xpath('.//tr[3]/td[2]')[0].text_content().strip()

        member = Legislator(name=name, post_id=district, role='Conseiller')
        member.add_source(COUNCIL_PAGE)
        member.add_source(url)
        member.image = table.xpath(
            './ancestor::td//div[not(@id="insert")]/img/@src')[0]
        member.add_contact('voice', phone, 'legislature')
        member.add_contact('email', email, None)
        yield member
def get_details(url):
    """Return ``(image, phone, email)`` scraped from the page at ``url``.

    The image is the ``src`` of the portrait ``<img>``; the phone is the text
    after the first ``': '`` in the "numbers" ``<dd>``; the e-mail is obtained
    by passing the script text inside a ``<dd>`` to ``process_email``.
    """
    detail_page = lxmlize(url)
    portrait_src = detail_page.xpath('string(//img[@class="portrait"]/@src)')
    numbers_text = detail_page.xpath('string(//dd[@class="numbers"]/text())')
    phone_number = numbers_text.split(': ')[1]
    script_source = detail_page.xpath('string(//dd/script)')
    return portrait_src, phone_number, process_email(script_source)
def get_people(self):
    """Yield the mayor, then each councillor listed on ``COUNCIL_PAGE``.

    The mayor's name and phone come from the first two text nodes of the
    "box" paragraph; each nested ``<div>`` supplies one councillor's district,
    name, phone and e-mail.

    The original created a temporary directory with ``tempfile.mkdtemp()`` on
    every call and never used or removed it; that leak is gone, as is an
    unused ``url`` lookup in the councillor loop.
    """
    page = lxmlize(COUNCIL_PAGE)

    mayor = page.xpath('//div[@class="box"]/p/text()')
    # The name follows the first '.', the phone follows the first ':'.
    m_name = mayor[0].strip().split('.')[1].strip()
    m_phone = mayor[1].strip().split(':')[1].strip()

    m = Legislator(name=m_name, post_id='Saguenay', role='Maire')
    m.add_source(COUNCIL_PAGE)
    m.add_contact('voice', m_phone, 'legislature')
    yield m

    councillors = page.xpath('//div[@class="box"]//div')
    for councillor in councillors:
        district = councillor.xpath('./h3')[0].text_content().replace('#', '')
        details = councillor.xpath('.//p/text()')
        # Re-decode the name as UTF-8 (presumably the page text was decoded
        # as Latin-1 upstream -- TODO confirm).
        name = details[0].encode('latin-1').decode('utf-8')
        name = name.replace('M. ', '').replace('Mme ', '').strip()
        phone = details[1].split(':')[1].strip().replace(' ', '-')
        email = councillor.xpath('.//a[contains(@href, "mailto:")]')[0].text_content()

        p = Legislator(name=name, post_id=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield a ``Legislator`` for the mayor and each councillor.

    People are the first rows of tables on ``COUNCIL_PAGE`` that have exactly
    two cells (the last such row is ignored).  A row with three text entries
    is the mayor; otherwise the first entry is the district and the last
    three entries are name, phone and e-mail.
    """
    page = lxmlize(COUNCIL_PAGE)

    councillor_trs = [
        tr for tr in page.xpath('//table//tr[1]') if len(tr) == 2
    ][:-1]
    for councillor_tr in councillor_trs:
        desc = [
            text.strip()
            for text in councillor_tr.xpath('.//text()[normalize-space()]')
            if text.strip()
        ]

        if len(desc) == 3:
            role = 'Maire'
            district = u'Saint-Jérôme'
        else:
            role = 'Conseiller'
            district = desc[0].replace(u'numéro ', '')

        name = desc[-3]
        phone = desc[-2]
        email = desc[-1]

        # string(...) already returns the whole src as one string.  The
        # original indexed it with [0], which kept only the first character
        # of the URL and raised IndexError for a row without an image.
        image = councillor_tr.xpath('string(.//img/@src)')

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        if image:
            p.image = image
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield a ``Legislator`` for every table row in the content block.

    A row whose text contains "Maire" is the mayor; any other row is a
    councillor whose district number is the first digit found in the first
    cell.  The href of the row's first link, minus ``mailto:``, is the e-mail.
    """
    listing = lxmlize(COUNCIL_PAGE)
    for row in listing.xpath('//div[@id="content"]//tr'):
        is_mayor = 'Maire' in row.text_content()
        name = row.xpath('./td')[1].text_content()
        if is_mayor:
            district, role = 'Sainte-Anne-de-Bellevue', 'Maire'
        else:
            first_cell_text = row.xpath('./td')[0].text_content()
            district = 'District ' + re.findall(r'\d', first_cell_text)[0]
            role = 'Conseiller'

        member = Legislator(name=name, post_id=district, role=role)
        member.add_source(COUNCIL_PAGE)

        links = row.xpath('.//a')
        if links:
            address = links[0].attrib['href'].replace('mailto:', '')
            member.add_contact('email', address, None)
        yield member
def get_people(self):
    """Yield the mayor, then every councillor found on ``COUNCIL_PAGE``.

    The page is fetched with an explicit browser user agent.  The mayor is
    delegated to ``self.scrape_mayor``; councillors are paragraphs whose bold
    text contains "Councillor".  Phone and e-mail may sit in a later sibling
    paragraph, so both have a fallback lookup.
    """
    page = lxmlize(COUNCIL_PAGE, user_agent='Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)')

    yield self.scrape_mayor(page)

    councillors = page.xpath('//strong[contains(text(), "Councillor")]/parent::p|//b[contains(text(), "Councillor")]/parent::p')
    for councillor in councillors:
        name = councillor.xpath('./strong/text()|./b/text()')[0].replace('Councillor', '').strip()
        # Raw string: '\d' in a normal string literal is an invalid escape
        # sequence, which newer Python versions warn about.
        district = re.findall(r'(?<=Ward \d, ).*', councillor.text_content())[0].strip()

        p = Legislator(name=name, post_id=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)

        p.image = councillor.xpath('.//img/@src')[0]

        phone = re.findall(r'Phone(.*)', councillor.text_content())
        node = councillor
        # Walk forward through sibling paragraphs until one mentions "Phone".
        # NOTE(review): each step moves to the SECOND following sibling --
        # confirm that skipping one paragraph per step is intended.
        while not phone:
            node = node.xpath('./following-sibling::p')[1]
            phone = re.findall(r'Phone(.*)', node.text_content())
        phone = phone[0].strip()

        email = councillor.xpath('.//a[contains(@href, "mailto:")]')
        if not email:
            email = councillor.xpath('./following-sibling::p//a[contains(@href, "mailto")]')
        email = email[0].text_content()

        # Seven-digit numbers get the 902 prefix.
        if len(re.sub(r'\D', '', phone)) == 7:
            phone = '902-%s' % phone
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)

        yield p
def get_people(self):
    """Yield the mayor, then every councillor found on ``COUNCIL_PAGE``.

    The mayor comes from the second paragraph of the first table in the
    article block.  Councillors are read from pairs of table rows: each
    odd-indexed row holds names and images, and the row after it holds the
    matching districts and phone numbers, column by column.
    """
    page = lxmlize(COUNCIL_PAGE)

    mayor = page.xpath('.//div[@class="item-page clearfix"]//table[1]//p')[1]
    name = mayor.xpath('.//strong/text()')[0]

    p = Legislator(name=name, post_id='Pointe-Claire', role='Maire')
    p.add_source(COUNCIL_PAGE)

    # Accepts "514 555-1234" or "514-555-1234"; the space becomes a dash.
    phone = re.findall(r'[0-9]{3}[ -][0-9]{3}-[0-9]{4}', mayor.text_content())[0].replace(' ', '-')
    p.add_contact('voice', phone, 'legislature')
    yield p

    rows = page.xpath('//tr')
    for i, row in enumerate(rows):
        # Only odd-indexed rows are processed as name rows.
        if i % 2 == 0:
            continue
        councillors = row.xpath('./td')
        for j, councillor in enumerate(councillors):
            name = councillor.text_content()
            # rows[i + 1].xpath('.//td//a[contains(@href, "maps")]/text()')[j]  # district number
            # rows[i + 1] is the detail row; j selects this person's column.
            district = rows[i + 1].xpath('.//td/p[1]/text()')[j].replace(' / ', '/')

            p = Legislator(name=name, post_id=district, role='Conseiller')
            p.add_source(COUNCIL_PAGE)
            p.image = councillor.xpath('.//img/@src')[0]

            phone = re.findall(r'[0-9]{3}[ -][0-9]{3}-[0-9]{4}', rows[i + 1].xpath('.//td')[j].text_content())[0].replace(' ', '-')

            p.add_contact('voice', phone, 'legislature')

            yield p