def get_people(self):
    """Yield a Legislator for each row of the council CSV export.

    The mayor is given the post 'Ottawa'; every other row uses its
    'District name' column, with known misspellings corrected.  The HTTP
    response is closed once iteration finishes or is abandoned.
    """
    # Correct typos. The City has been notified of the errors.
    district_fixes = {
        u'Knoxdale Merivale': u'Knoxdale-Merivale',
        u'Rideau Vanier': u'Rideau-Vanier',
        u'Orleans': u'Orléans',
    }
    address_fields = ('Address line 1', 'Address line 2', 'Locality',
                      'Postal code', 'Province')

    response = urlopen(COUNCIL_CSV_URL)
    try:
        for councillor in DictReader(response):
            name = '%s %s' % (councillor['First name'], councillor['Last name'])
            role = councillor['Elected office']
            if role == 'Mayor':
                district = 'Ottawa'
            else:
                district = councillor['District name']
            district = district_fixes.get(district, district)

            email = councillor['Email']
            address = ', '.join([councillor[field] for field in address_fields])
            phone = councillor['Phone']
            photo_url = councillor['Photo URL']

            p = Legislator(name=name, post_id=district, role=role)
            p.add_source(COUNCIL_CSV_URL)
            p.add_contact('email', email, None)
            p.add_contact('address', address, 'legislature')
            p.add_contact('voice', phone, 'legislature')
            p.image = photo_url
            yield p
    finally:
        # Release the connection even if the consumer stops early.
        response.close()
def get_people(self):
    """Yield a Legislator for each table cell on the council page.

    A cell whose heading mentions 'Mayor' (but not 'Deputy Mayor') is the
    mayor of Fredericton; every other cell is a councillor whose district
    is the parenthesised part of the 'Ward:' line.
    """
    page = lxmlize(COUNCIL_PAGE)
    for councillor in page.xpath('//table/tbody/tr/td'):
        text = councillor.xpath('.//strong/text()')[0]
        name = text.split(',')[0].replace('Name:', '').strip()
        # The flattened cell text is parsed several times below.
        content = councillor.text_content()

        if 'Mayor' in text and 'Deputy Mayor' not in text:
            role = 'Mayor'
            district = 'Fredericton'
        else:
            district = re.findall(r'(Ward:.*)(?=Address:)', content)[0].replace(':', '').strip()
            # Keep the name inside the parentheses, minus a trailing " Area".
            district = re.search(r'\((.+?)(?: Area)?\)', district).group(1)
            role = 'Councillor'

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath('.//img/@src')[0]

        address = re.findall(r'(?<=Address:).*(?=Home:)', content)[0].strip()
        p.add_contact('address', address, 'legislature')

        # Home number: text after "Home: (" up to "Fax:", with the ")" that
        # follows the area code replaced by "-".
        phone = re.findall(r'(?<=Home: \().*(?=Fax:)', content)[0]
        phone = re.sub(r'(?<=[0-9])(\)\D{1,2})(?=[0-9])', '-', phone).split()[0]
        p.add_contact('voice', phone, 'residence')

        # The office number is optional.
        phone = re.findall(r'(?<=Office: \().*(?=Fax:)', content)
        if phone:
            phone = phone[0].replace(') ', '-')
            p.add_contact('voice', phone, 'legislature')
        yield p
def get_people(self):
    """Yield a Legislator for each row of the council table.

    A row containing 'Maire' is the mayor; other rows are councillors
    whose district number is the first digit found in the first cell.
    """
    page = lxmlize(COUNCIL_PAGE)
    for councillor in page.xpath('//div[@id="content"]//tr'):
        cells = councillor.xpath('./td')
        # The second cell holds the name for every kind of row.
        name = cells[1].text_content()
        if 'Maire' in councillor.text_content():
            district = 'Sainte-Anne-de-Bellevue'
            role = 'Maire'
        else:
            district = 'District ' + re.findall(r'\d', cells[0].text_content())[0]
            role = 'Conseiller'

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)

        # The e-mail is optional; use the first link in the row if present.
        email = councillor.xpath('.//a')
        if email:
            email = email[0].attrib['href'].replace('mailto:', '')
            p.add_contact('email', email, None)
        yield p
def scrape_mayor(self):
    """Build and return the Legislator for the mayor of Summerside.

    Name, photo, phone, postal address and e-mail are read from fixed
    positions among the paragraphs of the mayor's article page.
    """
    page = lxmlize(MAYOR_PAGE, 'iso-8859-1')

    title = page.xpath('//div[@class="articletitle"]/h1')[0]
    mayor_name = title.text_content().replace('Mayor', '')

    p = Legislator(name=mayor_name, post_id='Summerside', role='Mayor')
    p.add_source(MAYOR_PAGE)

    image_src = page.xpath('//div[@class="articlebody-inside"]/p/img/@src')[0]
    p.image = image_src.replace('..', '')

    paragraphs = page.xpath('//div[@class="articlebody-inside"]/p')
    phone = re.findall(r'to (.*)', paragraphs[1].text_content())[0]
    # The address is split over two consecutive paragraphs.
    first_line = paragraphs[3].text_content().replace('by mail: ', '')
    address = first_line + ' ' + paragraphs[4].text_content()
    mail_links = paragraphs[5].xpath('.//a[contains(@href, "mailto:")]')
    email = mail_links[0].text_content()

    p.add_contact('voice', phone, 'legislature')
    p.add_contact('address', address, 'legislature')
    p.add_contact('email', email, None)
    return p
def get_people(self):
    """Yield a Legislator for the first row of each matched table.

    The heading text supplies both name and role (whatever remains once
    the name is removed, defaulting to "Councillor"); the district comes
    from the first comma-containing paragraph of the first cell.
    """
    page = lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//div[@id="WebPartWPQ1"]/table/tbody/tr[1]')
    for row in rows:
        # Prefer the deeply nested <strong>, falling back to any <strong>.
        heading = row.xpath(".//td[1]//strong//strong//strong//strong")
        if not heading:
            heading = row.xpath(".//td[1]//strong")
        text = heading[0].text_content()

        name = text.strip()
        for title in ("Deputy ", "Warden ", "Mayor"):
            name = name.replace(title, "")
        role = text.replace(name, "").strip() or "Councillor"
        if "," in name:
            name = name.split(",")[0].strip()

        place = row.xpath('.//td[1]//p[contains(text(),",")]/text()')[0]
        district = place.split(",")[1].strip()
        district = re.sub(r"\A(?:City|Municipality|Town|Township|Village) of\b| Township\Z", "", district)

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = row.xpath(".//td[1]//img/@src")[0]

        info = row.xpath(".//td[2]")[0].text_content()
        residence = re.findall(r"(?<=Residence:)(.*)(?=Municipal Office:)", info, flags=re.DOTALL)[0]
        self.get_contacts(residence, "residence", p)
        office = re.findall(r"(?<=Municipal Office:)(.*)", info, flags=re.DOTALL)[0]
        self.get_contacts(office, "legislature", p)
        yield p
def get_people(self):
    """Yield a Legislator for each member linked from the council sub-menu.

    The first link is treated as the mayor of Ajax; for the others the
    ward and role are parsed from the detail page heading.
    """
    listing = lxmlize(COUNCIL_PAGE)
    links = listing.xpath('//ul[@class="subNav top"]/li/ul//li/a')
    for index, link in enumerate(links):
        name = link.text_content()
        url = link.attrib['href']
        detail = lxmlize(url)

        if index == 0:
            district = 'Ajax'
            role = 'Mayor'
        else:
            heading = detail.xpath('//div[@id="printAreaContent"]//h1')[0].text_content()
            district = re.findall(r'Ward.*', heading)[0].strip()
            role = re.findall('((Regional)? ?(Councillor))', heading)[0][0]

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = detail.xpath('//div[@class="intQuicklinksPhoto"]/img/@src')[0]

        # Skip the first row of the contact table.
        for row in detail.xpath('//table[@class="datatable"][1]//tr')[1:]:
            cells = row.xpath('./td')
            contact_type = cells[0].text_content().strip()
            contact = cells[1].text_content().strip()
            if re.match(r'(Phone)|(Fax)|(Email)', contact_type):
                contact_type = CONTACT_DETAIL_TYPE_MAP[contact_type]
                note = None if contact_type == 'email' else 'legislature'
                p.add_contact(contact_type, contact, note)
            else:
                # Rows that are not phone/fax/e-mail are recorded as links.
                p.add_link(contact, None)
        yield p
def scrape_mayor(url):
    """Build and return the Legislator for the mayor of Saskatoon.

    The name and photo come from ``url``; address, phone and fax come
    from the page behind its "Contact the Mayor" link.
    """
    bio_page = lxmlize(url)
    name = bio_page.xpath('//tr/td/p')[-1].text_content().replace('Mayor', '')
    image = bio_page.xpath('//div[@class="sask_ArticleBody"]//img/@src')[0]
    contact_url = bio_page.xpath('//a[contains(text(), "Contact the Mayor")]/@href')[0]

    contact_page = lxmlize(contact_url)
    # Drop the first text node of the address paragraph.
    address_lines = contact_page.xpath(
        '//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]/p[4]/text()'
    )
    address = ' '.join(address_lines[1:])
    raw_phone = contact_page.xpath(
        '//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]/p[5]/span/text()'
    )[0]
    phone = raw_phone.replace('(', '').replace(') ', '-')
    raw_fax = contact_page.xpath(
        '//div[@id="ctl00_PlaceHolderMain_RichHtmlField1__ControlWrapper_RichHtmlField"]/p[6]/span/text()'
    )[0]
    fax = raw_fax.replace('(', '').replace(') ', '-')

    p = Legislator(name=name, post_id='Saskatoon', role='Mayor')
    p.add_source(url)
    p.image = image
    p.add_contact('address', address, 'legislature')
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('fax', fax, 'legislature')
    return p
def get_people(self):
    """Yield the mayor, then a Legislator for each councillor cell.

    The first matched cell is handed to ``scrape_mayor``.  Remaining
    cells without links are skipped; the others supply name, district
    and the URL of a detail page holding phone numbers, links and e-mail.
    """
    listing = lxmlize(COUNCIL_PAGE)
    cells = listing.xpath('//div[@class="article-content"]//td[@class="ms-rteTableOddCol-0"]')
    yield scrape_mayor(cells[0])

    for cell in cells[1:]:
        anchors = cell.xpath('.//a')
        if not anchors:
            continue
        name = anchors[0].text_content().strip()
        district = anchors[1].text_content()
        url = cell.xpath('.//a/@href')[0]
        detail = lxmlize(url)

        p = Legislator(name=name, post_id=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = cell.xpath('./preceding-sibling::td//img/@src')[-1]

        # Any text node containing a digit is recorded as a phone number.
        for contact in detail.xpath('.//td[@class="ms-rteTableOddCol-0"]//text()'):
            if re.search(r'[0-9]', contact):
                p.add_contact('voice', contact.strip().replace(' ', '-'), 'legislature')

        get_links(p, detail.xpath('.//td[@class="ms-rteTableOddCol-0"]')[0])

        mailto = detail.xpath('string(//a[contains(@href, "mailto:")]/@href)')
        p.add_contact('email', mailto[len('mailto:'):], None)
        yield p
def get_people(self):
    """Yield a Legislator for each non-empty paragraph in the council table.

    The name is looked up in several markup variants; a name containing
    'Mayor' marks the mayor.  Every member is posted to "LaSalle".
    """
    page = lxmlize(COUNCIL_PAGE)
    paragraphs = page.xpath('//div[@align="center" and not(@class="background")]//td/p')
    for paragraph in paragraphs:
        content = paragraph.text_content()
        if not content.strip():
            continue

        # Try the known markup variants for the name in turn.
        candidates = paragraph.xpath('./font/b/text()') or paragraph.xpath('./font/text()')
        if 'e-mail' in candidates[0]:
            candidates = paragraph.xpath('./b/font/text()')
        name = candidates[0]

        if 'Mayor' in name:
            name = name.replace('Mayor', '')
            role = 'Mayor'
        else:
            role = 'Councillor'

        p = Legislator(name=name, post_id="LaSalle", role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = paragraph.xpath('./parent::td//img/@src')[0]

        email = paragraph.xpath('.//a[contains(@href, "mailto:")]/text()')[0]
        p.add_contact('email', email, None)

        # The office phone is optional; the home phone is expected.
        office = re.findall(r'(?<=phone:)(.*)(?=home)', content, flags=re.DOTALL)
        if office:
            p.add_contact('voice', office[0].strip(), 'legislature')
        home = re.findall(r'(?<=home phone:)(.*)', content, flags=re.DOTALL)[0]
        p.add_contact('voice', home.strip(), 'residence')
        yield p
def get_people(self):
    """Yield a Legislator for each non-empty <strong> name in the last table.

    A name containing 'maire' gets the city-wide district; otherwise the
    district number is taken from the digits of the last <strong> in the
    sibling cells.  The role is 'Maire' for the first matched element and
    'Conseiller' for all others.
    """
    page = lxmlize(
        COUNCIL_PAGE,
        user_agent='Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)')
    councillors = page.xpath('//table[last()]//tr/td[1]//strong')
    for i, councillor in enumerate(councillors):
        name = councillor.text_content().strip()
        if not name:
            continue

        if 'maire' in name:
            name = name.split('maire')[1].strip()
            district = u'Montréal-Est'
        else:
            district = councillor.xpath(
                './ancestor::td/following-sibling::td//strong')[-1].text_content()
            # Keep only the digits of the district label.
            district = 'District %s' % re.sub(r'\D+', '', district)

        email = councillor.xpath(
            './ancestor::tr/following-sibling::tr//a[contains(@href, "mailto:")]'
        )[0].text_content().strip()
        # NOTE(review): role depends on position, not on the 'maire' test
        # above — confirm the mayor is always the first element.
        role = 'Maire' if i == 0 else 'Conseiller'

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield a Legislator for each <strong> entry that is not an e-mail.

    Every member receives the same general phone and fax numbers read
    from the paragraphs following the page's large title.
    """
    page = lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    shared = page.xpath('//p[@class="large_title"]/following-sibling::p/text()')
    general_phone, general_fax = shared[0], shared[1]

    for entry in page.xpath('//tr/td/p/strong'):
        label = entry.text_content()
        # Entries containing "@" are e-mail addresses, not people.
        if "@" in label:
            continue

        if 'Mayor' in label:
            name = label.replace('Mayor', '')
            district = 'Dollard-Des Ormeaux'
            role = 'Maire'
        else:
            # The label is a district digit followed by the name.
            name = re.split(r'[0-9]', label)[1]
            district = 'District ' + re.findall(r'[0-9]', label)[0]
            role = 'Conseiller'

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = entry.xpath('./parent::p/parent::td/parent::tr/preceding-sibling::tr//img/@src')[0]

        mail_links = entry.xpath('./parent::p/following-sibling::p//a[contains(@href, "mailto:")]')
        if mail_links:
            p.add_contact('email', mail_links[0].text_content(), None)
        p.add_contact('voice', general_phone, 'legislature')
        p.add_contact('fax', general_fax, 'legislature')
        yield p
def get_people(self):
    """Yield the mayor, then a Legislator for each councillor row.

    The mayor and councillor pages are found via the "Mayor" and
    "Councillors" links on the council page.  Councillor rows are table
    rows containing an image, excluding the last such row.
    """
    page = lxmlize(COUNCIL_PAGE)
    mayor_url = page.xpath('//a[contains(text(), "Mayor")]/@href')[0]
    yield self.scrape_mayor(mayor_url)

    councillors_url = page.xpath('//a[contains(text(), "Councillors")]/@href')[0]
    cpage = lxmlize(councillors_url)
    councillor_rows = cpage.xpath('//tr[td//img]')[:-1]
    for councillor_row in councillor_rows:
        # Each row is expected to hold exactly an image cell and an info cell.
        img_cell, info_cell = tuple(councillor_row)

        name = info_cell.xpath(
            'string(.//span[contains(text(), "Councillor")])')[len('Councillor '):]
        district = info_cell.xpath('string(.//p[contains(text(), "District")])')

        email = info_cell.xpath('string(.//a[contains(@href, "mailto:")])')
        if not email:
            # Fall back to plain text following an "E-mail" label.
            email = info_cell.xpath(
                'string(.//strong[contains(text(), "E-mail")]/following-sibling::text())')

        phone = info_cell.xpath(
            'string(.//p[contains(.//text(), "Telephone:")])').split(':')[1]

        # Read the image's `src` relative to this row's image cell; an
        # absolute `//img/@href` never yields this councillor's photo.
        img_url_rel = img_cell.xpath('string(.//img/@src)')
        img_url = urljoin(councillors_url, img_url_rel)

        p = Legislator(name=name, post_id=district, role='Conseiller')
        p.add_source(COUNCIL_PAGE)
        p.add_source(councillors_url)
        p.add_contact('email', email, None)
        p.add_contact('voice', phone, 'legislature')
        p.image = img_url
        yield p
def get_people(self):
    """Yield an MHA Legislator for each member row with a photo link.

    Party affiliation is looked up by stripped member name in the mapping
    built from the party page.  Rows without a photo link are skipped.
    """
    member_parties = dict(process_parties(lxmlize(PARTY_PAGE)))
    page = lxmlize(COUNCIL_PAGE)
    # Skip the first row of each matched table.
    for row in page.xpath('//table[not(@id="footer")]/tr')[1:]:
        cell_texts = [
            cell.xpath('string(.)').replace(u'\xa0', u' ') for cell in row
        ]
        name, district, _, email = cell_texts
        phone = row[2].xpath('string(text()[1])')

        photo_links = row[0].xpath('./a/@href')
        if not photo_links:
            continue  # there is a vacant district
        photo_page_url = photo_links[0]
        photo_page = lxmlize(photo_page_url)
        photo_url = photo_page.xpath('string(//table//img/@src)')

        district = district.replace(' - ', u'—')  # m-dash
        party = get_party(member_parties[name.strip()])

        p = Legislator(name=name, post_id=district, role='MHA',
                       party=party, image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_source(photo_page_url)
        p.add_contact('email', email, None)
        # TODO: either fix phone regex or tweak phone value
        p.add_contact('voice', phone, 'legislature')
        yield p
def get_people(self):
    """Yield an MLA Legislator for each member cell on the listing page.

    Name and riding come from the listing cell; photo, e-mail and phone
    come from the member's detail page.
    """
    listing = lxmlize(COUNCIL_PAGE)
    cells = listing.xpath(
        '//div[@class="views-field views-field-field-picture"]/parent::td')
    for cell in cells:
        name = cell[1].text_content().replace(' .', '. ')  # typo on page
        riding = cell[2].text_content()
        if 'Mackenzie Delta' in riding:
            riding = 'Mackenzie-Delta'

        detail_url = cell[0].xpath('string(.//a/@href)')
        detail = lxmlize(detail_url)
        photo_url = detail.xpath('string(//div[@class="field-item even"]/img/@src)')
        email = detail.xpath('string(//a[contains(@href, "mailto:")])')
        contact_text = detail.xpath('string(//div[@property="content:encoded"]/p[1])')
        # Matches "P: ..." or "Phone: ..." followed by digits and dashes.
        phone_match = re.search(r'P(hone)?: ([-0-9]+)', contact_text)
        phone = phone_match.group(2)

        p = Legislator(name=name, post_id=riding, role='MLA', image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_source(detail_url)
        p.add_contact('email', email, None)
        p.add_contact('voice', phone, 'legislature')
        yield p
def get_people(self):
    """Yield an MLA Legislator for each of the 27 district rows.

    The first table's rows (after the first one) must number exactly 27.
    E-mail, phone and photo are fetched with ``scrape_extended_info``.
    """
    page = lxmlize(COUNCIL_PAGE)
    first_table = page.cssselect('table')[0]
    rows = first_table.cssselect('tr')[1:]
    assert len(rows) == 27  # There should be 27 districts

    for row in rows:
        # Each row must have exactly four cells; only two are used.
        _number_cell, district_cell, member_cell, _last_cell = row.cssselect('td')

        district_link = district_cell.cssselect('a')[0]
        district = district_link.text_content().strip().replace(' - ', '-')

        member_link = member_cell.cssselect('a')[0]
        name = member_link.text_content()
        # Strip the honorific and party suffixes from the link text.
        for noise in ('Hon. ', ' (LIB)', ' (PC)'):
            name = name.replace(noise, '')
        name = name.strip()
        url = member_link.get('href')

        email, phone, photo_url = scrape_extended_info(url)

        p = Legislator(name=name, post_id=district, role='MLA', image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('email', email, None)
        p.add_contact('voice', phone, 'legislature')
        yield p
def scrape_mayor(self, url):
    """Build and return the Legislator for the mayor of Langley.

    The name is the third and fourth space-separated words of the second
    paragraph; the e-mail is derived from it (first initial + second
    word, lower-cased, at langleycity.ca).
    """
    container = lxmlize(url).xpath('//div[@class="item-page"]')[0]

    words = container.xpath('p[2]/text()')[0].split(' ')
    name = ' '.join(words[2:4])
    name_parts = name.lower().split(' ')
    email = name_parts[0][0] + name_parts[1] + '@langleycity.ca'
    photo_url = container.xpath('p[1]/img/@src')[0]

    p = Legislator(name=name, post_id='Langley', role='Mayor', image=photo_url)
    p.add_source(url)
    p.add_contact('email', email, None)

    # Phone and address are parsed from the text of the last paragraph.
    contact_lines = container.xpath('p[last()]/text()')
    phone = re.findall(r'Phone(:?) (.*)', '\n'.join(contact_lines))[0][1]
    address = re.findall(r'Address: (.*) Phone', ' '.join(contact_lines))[0]
    p.add_contact('address', address, 'office')
    p.add_contact('voice', phone, 'office')
    return p
def get_people(self):
    """Yield a Legislator for each member block on the council page.

    The heading has the form "<name>, <position>[ <post>]".  When a post
    follows the position it is converted with ``post_number``; otherwise
    the post is 'Wellesley'.
    """
    page = lxmlize(COUNCIL_PAGE)
    # First block of the first group, then every block of the second group.
    blocks = page.xpath('//div[@class="img_four"][1]/div[1]')
    blocks = blocks + page.xpath('//div[@class="img_four"][2]/div')

    for block in blocks:
        heading = block.xpath('string(./p/strong)')
        name, position = heading.split(',')
        position = position.strip()
        if ' ' in position:
            position, post_id = position.split(' ', 1)
            post_id = post_number(post_id)
        else:
            post_id = 'Wellesley'

        address_lines = [line.strip() for line in block.xpath('./p/text()')]
        addr = '\n'.join(address_lines).strip()
        phone = block.xpath('string(.//a[starts-with(@href, "tel:")])')
        email = block.xpath('string(.//a[starts-with(@href, "mailto:")])')
        image = block.xpath('string(.//img[1]/@src)')

        p = Legislator(name=name, post_id=post_id, role=position, image=image)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('address', addr, 'legislature')
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        yield p
def mayor_data(url):
    """Build and return the Legislator for the mayor of Mississauga.

    The name is everything after the first three words of the paragraph
    containing "Worship Mayor"; the e-mail is the text of the first
    mailto link.
    """
    page = lxmlize(url)

    # TODO: Consider getting photo. It's on a separate page.
    title_line = page.xpath('//p[contains(text(), "Worship Mayor")]/text()')[0]
    words = title_line.split()
    name = ' '.join(words[3:])  # TODO: probably too brittle
    email = page.xpath('//a[contains(@href, "mailto")]/text()')[0]

    p = Legislator(name=name, post_id='Mississauga', role='Mayor')
    p.add_source(url)
    p.add_contact('email', email, None)
    return p
def scrape_mayor(url):
    """Build and return the Legislator for the mayor of Caledon.

    Address and phone are taken from the second text node of the
    paragraphs whose <strong> label contains "mail" and "phone".
    """
    page = lxmlize(url)

    heading = page.xpath('//div[@id="printAreaContent"]/h1/strong/text()')[0]
    name = heading.replace('Mayor', '').strip()

    mail_texts = page.xpath('//strong[contains(text(), "mail")]/parent::p/text()')
    address = mail_texts[1].replace(':', '').strip()
    phone_texts = page.xpath('//strong[contains(text(), "phone")]/parent::p/text()')
    # The number is the second whitespace-separated token of the line.
    phone = phone_texts[1].split()[1]

    p = Legislator(name=name, post_id='Caledon', role='Mayor')
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    p.image = page.xpath('//h2[contains(text(), "About me")]/img/@src')[0]
    p.add_contact('address', address, 'legislature')
    p.add_contact('voice', phone, 'legislature')
    return p
def get_people(self):
    """Yield a Legislator for each two-cell first row of the page's tables.

    The last such row is ignored.  A row with exactly three non-blank
    text fragments is the mayor; otherwise the first fragment is the
    district.  The last three fragments are name, phone and e-mail.
    """
    page = lxmlize(COUNCIL_PAGE)
    councillor_trs = [
        tr for tr in page.xpath('//table//tr[1]') if len(tr) == 2
    ][:-1]
    for councillor_tr in councillor_trs:
        desc = [
            text.strip()
            for text in councillor_tr.xpath('.//text()[normalize-space()]')
            if text.strip()
        ]

        if len(desc) == 3:
            role = 'Maire'
            district = u'Saint-Jérôme'
        else:
            role = 'Conseiller'
            district = desc[0].replace(u'numéro ', '')

        name, phone, email = desc[-3], desc[-2], desc[-1]
        # XPath string() already returns the whole URL as a string;
        # indexing it would keep only its first character.
        image = councillor_tr.xpath('string(.//img/@src)')

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = image
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield Abbotsford's councillors, then its mayor.

    Councillors come from menu links 3 through 10 whose text starts with
    'Councillor'; each detail page's first titled link gives the name
    (title attribute) and e-mail (href).  The mayor is yielded last,
    without an e-mail.
    """
    menu = lxmlize(COUNCIL_PAGE)
    for link in menu.xpath('//li[@id="pageid2117"]/ul/li/a')[2:10]:
        if link.text.startswith('Councillor'):
            url = link.attrib['href']
            detail = lxmlize(url)

            mail_link = detail.xpath('//a[@title]')[0]
            name = mail_link.attrib['title']
            email = mail_link.attrib['href'][len('mailto:'):]
            photo_url = detail.xpath(
                'string(//div[@class="pageContent"]//img[@align="right"]/@src)')

            p = Legislator(name=name, post_id='Abbotsford',
                           role='Councillor', image=photo_url)
            p.add_source(url)
            p.add_contact('email', email, None)
            yield p

    mayor_page = lxmlize(MAYOR_URL)
    # Drop the first word of the heading to get the name.
    mayor_name = mayor_page.xpath('string(//h1)').split(' ', 1)[1]
    mayor_photo = mayor_page.xpath('string(//img[@hspace=10]/@src)')
    # email is hidden behind a form
    mayor = Legislator(name=mayor_name, post_id='Abbotsford',
                       role='Mayor', image=mayor_photo)
    mayor.add_source(MAYOR_URL)
    yield mayor
def get_people(self):
    """Yield Abbotsford's councillors followed by the mayor.

    Only menu links in positions 2..9 whose text begins with 'Councillor'
    are followed.  The councillor's name and e-mail are read from the
    first link carrying a title attribute on the detail page.
    """
    listing = lxmlize(COUNCIL_PAGE)
    candidates = listing.xpath('//li[@id="pageid2117"]/ul/li/a')[2:10]
    councillor_links = [a for a in candidates if a.text.startswith('Councillor')]

    for anchor in councillor_links:
        url = anchor.attrib['href']
        member_page = lxmlize(url)
        titled = member_page.xpath('//a[@title]')[0]
        photo_url = member_page.xpath('string(//div[@class="pageContent"]//img[@align="right"]/@src)')

        p = Legislator(
            name=titled.attrib['title'],
            post_id='Abbotsford',
            role='Councillor',
            image=photo_url,
        )
        p.add_source(url)
        p.add_contact('email', titled.attrib['href'][len('mailto:'):], None)
        yield p

    mayor_page = lxmlize(MAYOR_URL)
    heading = mayor_page.xpath('string(//h1)')
    name = heading.split(' ', 1)[1]
    photo_url = mayor_page.xpath('string(//img[@hspace=10]/@src)')
    # email is hidden behind a form
    p = Legislator(name=name, post_id='Abbotsford', role='Mayor', image=photo_url)
    p.add_source(MAYOR_URL)
    yield p
def get_people(self):
    """Yield the mayor of Trois-Rivières, then one Legislator per district.

    The council page is rendered client-side, so district names and
    relative URLs are extracted from the raw response text with a regular
    expression rather than from parsed HTML.
    """
    # mayor first, can't find email
    page = lxmlize(MAYOR_URL)
    photo_url = page.xpath('string(//img/@src[contains(., "Maire")])')
    name = page.xpath('string(//td[@class="contenu"]/text()[last()])')
    p = Legislator(name=name, post_id=u"Trois-Rivières", role="Maire", image=photo_url)
    p.add_source(MAYOR_URL)
    yield p

    # Districts whose leading article is kept as-is.
    keep_article = ("des Estacades", "des Plateaux", "des Terrasses", "du Sanctuaire")

    resp = requests.get(COUNCIL_PAGE)
    # page rendering through JS on the client
    page_re = re.compile(r'createItemNiv3.+"District (.+?)".+(index.+)\\"')
    for district, url_rel in page_re.findall(resp.text):
        if district not in keep_article:
            # Strip a leading "de ", "de la ", "des " or "du ".
            district = re.sub(r"\A(?:de(?: la)?|des|du) ", "", district)

        url = urljoin(COUNCIL_PAGE, url_rel)
        page = lxmlize(url)
        name = page.xpath("string(//h2)")
        email = page.xpath('string(//a/@href[contains(., "mailto:")])')[len("mailto:"):]
        photo_url = page.xpath('string(//img/@src[contains(., "Conseiller")])')

        p = Legislator(name=name, post_id=district, role="Conseiller", image=photo_url)
        p.add_source(url)
        p.add_contact("email", email, None)
        yield p
def get_people(self):
    """Yield the mayor of Trois-Rivières, then one Legislator per district.

    District names and relative detail URLs are pulled out of the raw
    council page source with a regular expression because the page is
    built by JavaScript on the client.
    """
    # mayor first, can't find email
    page = lxmlize(MAYOR_URL)
    photo_url = page.xpath('string(//img/@src[contains(., "Maire")])')
    name = page.xpath('string(//td[@class="contenu"]/text()[last()])')
    p = Legislator(name=name, post_id=u"Trois-Rivières", role="Maire", image=photo_url)
    p.add_source(MAYOR_URL)
    yield p

    resp = requests.get(COUNCIL_PAGE)
    # page rendering through JS on the client
    page_re = re.compile(r'createItemNiv3.+"District (.+?)".+(index.+)\\"')
    for district, url_rel in page_re.findall(resp.text):
        # These four districts keep their leading article; all others
        # lose a leading "de ", "de la ", "des " or "du ".
        if district not in ('des Estacades', 'des Plateaux', 'des Terrasses', 'du Sanctuaire'):
            district = re.sub(r'\A(?:de(?: la)?|des|du) ', '', district)

        url = urljoin(COUNCIL_PAGE, url_rel)
        page = lxmlize(url)
        name = page.xpath('string(//h2)')
        email = page.xpath(
            'string(//a/@href[contains(., "mailto:")])')[len('mailto:'):]
        photo_url = page.xpath(
            'string(//img/@src[contains(., "Conseiller")])')

        p = Legislator(name=name, post_id=district, role='Conseiller', image=photo_url)
        p.add_source(url)
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield a Legislator for each two-cell first row of the page's tables.

    The final matching row is dropped.  Rows with exactly three non-blank
    text fragments describe the mayor; other rows start with the district
    label.  Name, phone and e-mail are always the last three fragments.
    """
    page = lxmlize(COUNCIL_PAGE)
    councillor_trs = [tr for tr in page.xpath('//table//tr[1]') if len(tr) == 2][:-1]
    for councillor_tr in councillor_trs:
        desc = [text.strip()
                for text in councillor_tr.xpath('.//text()[normalize-space()]')
                if text.strip()]

        if len(desc) == 3:
            role = 'Maire'
            district = u'Saint-Jérôme'
        else:
            role = 'Conseiller'
            district = desc[0].replace(u'numéro ', '')

        name = desc[-3]
        phone = desc[-2]
        email = desc[-1]
        # string() yields the full src value; do not index into it, or
        # only the first character of the URL is kept.
        image = councillor_tr.xpath('string(.//img/@src)')

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.image = image
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield a Legislator for each member link on the council page.

    Link text containing "Ward N" identifies a councillor for that ward;
    any other link is treated as the mayor of Kawartha Lakes.  E-mail and
    photo come from the linked page.
    """
    listing = lxmlize(COUNCIL_PAGE)
    for link in listing.xpath('//p[@class="WSIndent"]/a'):
        label = link.text_content()
        wards = re.findall(r'(Ward [0-9]{1,2})', label)
        if wards:
            district = wards[0]
            name = label.replace(district, '').strip()
            role = 'Councillor'
        else:
            district = 'Kawartha Lakes'
            name = label.replace('Mayor', '').strip()
            role = 'Mayor'

        url = link.attrib['href']
        detail = lxmlize(url)
        mailto = detail.xpath('//a[contains(@href, "mailto:")]/@href')[0]
        # Keep what follows the last colon of the mailto href.
        email = mailto.rsplit(':', 1)[1].strip()
        image = detail.xpath('//img[@class="image-right"]/@src')[0]

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('email', email, None)
        p.image = image
        yield p
def get_people(self):
    """Yield the mayor, then a Legislator for each councillor paragraph.

    Councillor paragraphs are those with a <strong> or <b> child whose
    text contains "Councillor".  Phone and e-mail may live in later
    sibling paragraphs, so both are searched for beyond the paragraph.
    """
    page = lxmlize(COUNCIL_PAGE, user_agent='Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)')
    yield self.scrape_mayor(page)

    councillors = page.xpath('//strong[contains(text(), "Councillor")]/parent::p|//b[contains(text(), "Councillor")]/parent::p')
    for councillor in councillors:
        name = councillor.xpath('./strong/text()|./b/text()')[0].replace('Councillor', '').strip()
        # The district is whatever follows "Ward N, " on the same line.
        district = re.findall(r'(?<=Ward \d, ).*', councillor.text_content())[0].strip()

        p = Legislator(name=name, post_id=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.image = councillor.xpath('.//img/@src')[0]

        phone = re.findall(r'Phone(.*)', councillor.text_content())
        node = councillor
        while not phone:
            # NOTE(review): this steps to the *second* following sibling
            # each time — confirm that skipping one paragraph is intended.
            node = node.xpath('./following-sibling::p')[1]
            phone = re.findall(r'Phone(.*)', node.text_content())
        phone = phone[0].strip()

        email = councillor.xpath('.//a[contains(@href, "mailto:")]')
        if not email:
            email = councillor.xpath('./following-sibling::p//a[contains(@href, "mailto")]')
        email = email[0].text_content()

        # Seven-digit numbers lack an area code; prefix 902.
        if len(re.sub(r'\D', '', phone)) == 7:
            phone = '902-%s' % phone
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield a Legislator for each non-empty cell of the council table.

    The first cell is the mayor of Kirkland.  For other cells the
    district is the part of the <h2> after "- ", with " Ouest"/" Est"
    lower-cased.
    """
    page = lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    cells = page.xpath('//div[@id="PageContent"]/table/tbody/tr/td')
    for cell in cells:
        if not cell.text_content().strip():
            continue

        if cell == cells[0]:
            district, role = 'Kirkland', 'Maire'
        else:
            heading = cell.xpath('.//h2')[0].text_content()
            district = re.search('- (.+)', heading).group(1).strip()
            district = district.replace(' Ouest', ' ouest').replace(' Est', ' est')
            role = 'Conseiller'

        name = cell.xpath('.//strong/text()')[0]
        # Normalise "T 514 ..., # 123" style numbers to dashes and " x".
        raw_phone = cell.xpath('.//div[contains(text(), "#")]/text()')[0]
        phone = raw_phone.replace('T ', '').replace(' ', '-').replace(',-#-', ' x')
        email = cell.xpath('.//a[contains(@href, "mailto:")]')[0].text_content()

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        p.image = cell.xpath('.//img/@src')[0]
        yield p
def get_people(self):
    """Yield a Legislator for every member linked from the council page.

    A "Ward N" match in the link text marks a councillor; links without
    one are the mayor of Kawartha Lakes.  The e-mail and image are read
    from each member's own page.
    """
    index_page = lxmlize(COUNCIL_PAGE)
    anchors = index_page.xpath('//p[@class="WSIndent"]/a')
    for anchor in anchors:
        text = anchor.text_content()
        ward_matches = re.findall(r'(Ward [0-9]{1,2})', text)
        if not ward_matches:
            district = 'Kawartha Lakes'
            name = text.replace('Mayor', '').strip()
            role = 'Mayor'
        else:
            district = ward_matches[0]
            name = text.replace(district, '').strip()
            role = 'Councillor'

        url = anchor.attrib['href']
        member_page = lxmlize(url)
        hrefs = member_page.xpath('//a[contains(@href, "mailto:")]/@href')
        # The address is the text after the final ":" of the first href.
        email = hrefs[0].rsplit(':', 1)[1].strip()
        image = member_page.xpath('//img[@class="image-right"]/@src')[0]

        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('email', email, None)
        p.image = image
        yield p
def mayor_data(url, name):
    """Build and return the Legislator for the mayor of Regina.

    The photo is the first image in the content container of ``url``;
    the e-mail is the text of the first "@"-containing link on the
    mayor's contact page.
    """
    bio_page = lxmlize(url)
    photo_src = bio_page.xpath('string((//div[@id="contentcontainer"]//img)[1]/@src)')
    photo_url = urljoin(url, photo_src)

    contact_page = lxmlize(MAYOR_CONTACT_URL)
    email = contact_page.xpath('string(//a[contains(., "@")][1])')

    mayor = Legislator(name=name, post_id='Regina', role='Mayor')
    for source in (COUNCIL_PAGE, url, MAYOR_CONTACT_URL):
        mayor.add_source(source)
    mayor.add_contact('email', email, None)
    mayor.image = photo_url
    return mayor
def councillor_data(url, name, ward):
    """Build and return the Legislator for one Winnipeg councillor.

    The e-mail is derived from the value after "=" in the e-mail row's
    link href, suffixed with "@winnipeg.ca".
    """
    page = lxmlize(url)

    photo_src = page.xpath('string(//img[@class="bio_pic"]/@src)')
    photo_url = urljoin(url, photo_src)
    phone = page.xpath('string(//td[contains(., "Phone")]/following-sibling::td)')
    # email is, sadly, a form; rebuild the address from the form link.
    form_href = page.xpath('string(//tr[contains(., "Email")]//a/@href)')
    email = form_href.split('=')[1] + '@winnipeg.ca'

    p = Legislator(name=name, post_id=ward, role='Councillor')
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    p.add_contact('email', email, None)
    p.add_contact('voice', phone, 'legislature')
    p.image = photo_url
    return p
def mayor_data(url):
    """Build and return the Legislator for the mayor of Strathcona County.

    The name is the part of the <h1> after the first "-"; photo, e-mail
    and phone are read from the page's "usercontent" container.
    """
    page = lxmlize(url)
    heading = page.xpath('string(//h1)')
    name = heading.split('-')[1]

    content = page.xpath('//div[@class="usercontent"]')[0]
    photo_url = urljoin(url, content.xpath('string(.//img[1]/@src)'))
    email = content.xpath('string(.//a/text()[contains(., "@")])')
    # The number is the text node right after the "Phone" label.
    phone = content.xpath(
        'string(.//strong[contains(., "Phone")]/following-sibling::text()[1])'
    ).strip()

    p = Legislator(name=name, post_id='Strathcona County', role='Mayor')
    p.add_source(COUNCIL_PAGE)
    p.add_source(url)
    p.add_contact('voice', phone, 'legislature')
    p.add_contact('email', email, None)
    p.image = photo_url
    return p
def mayor_data(url, name):
    """Build and return the Legislator for the mayor of Regina.

    ``url`` supplies the photo (first image inside the content
    container); the separate mayor contact page supplies the e-mail.
    """
    relative_photo = lxmlize(url).xpath(
        'string((//div[@id="contentcontainer"]//img)[1]/@src)')
    photo_url = urljoin(url, relative_photo)

    email = lxmlize(MAYOR_CONTACT_URL).xpath('string(//a[contains(., "@")][1])')

    m = Legislator(name=name, post_id='Regina', role='Mayor')
    m.add_source(COUNCIL_PAGE)
    m.add_source(url)
    m.add_source(MAYOR_CONTACT_URL)
    m.add_contact('email', email, None)
    m.image = photo_url
    return m
def get_people(self):
    """Yield a Legislator for each member page linked from the council page.

    Links ending in 'address.html' are skipped.  The detail page heading
    is "<role> <name ...> <last word>"; the first word becomes the role
    and the last word is dropped from the name.
    """
    listing = lxmlize(COUNCIL_PAGE)
    for link in listing.xpath('//div[@class="section"]//a'):
        url = link.attrib['href']
        if url.endswith('address.html'):
            continue

        detail = lxmlize(url)
        heading = detail.xpath('string(//div[@id="content"]/h1)')
        role, rest = heading.split(' ', 1)
        name = ' '.join(rest.split()[:-1])
        photo_url = detail.xpath('string(//img[@class="float-right"]/@src)')
        email = detail.xpath('string(//a[starts-with(@href, "mailto:")])')

        p = Legislator(name=name, post_id='Saanich', role=role, image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('email', email, None)
        yield p
def councillor_data(url, name, role):
    """Build and return the Legislator for one Victoria council member.

    E-mail, phone and photo are read from the member page at ``url``;
    the phone is the text after the first ":" of the "Phone" line.
    """
    page = lxmlize(url)
    email = page.xpath('string(//a[contains(@href, "mailto")])')
    phone_line = page.xpath(
        'string(//div[@id="content"]//strong[1]/'
        'following-sibling::text()[contains(., "Phone")])')
    phone = phone_line.split(':')[1]
    photo_src = page.xpath('string(//div[@id="content"]//img[1]/@src)')
    photo_url = urljoin(url, photo_src)

    # TODO: should post_id be "Neighbourhood Liaison"?
    member = Legislator(name=name, post_id='Victoria', role=role)
    member.add_source(COUNCIL_PAGE)
    member.add_source(url)
    member.add_contact('email', email, None)
    member.add_contact('voice', phone, 'legislature')
    member.image = photo_url
    return member
def get_people(self):
    """Yield Moncton's mayor, then a Legislator for each councillor cell.

    Cells without a link are skipped.  The district is the second
    non-blank <span> text, with 'At Large' mapped to 'Moncton' and
    'Deputy Mayor' replaced by the following span.
    """
    listing = lxmlize(COUNCIL_PAGE, 'iso-8859-1')
    mayor_url = listing.xpath('//li[@id="pageid193"]//a/@href')[0]
    yield scrape_mayor(mayor_url)

    for cell in listing.xpath('//td[@class="cityfonts"]'):
        anchors = cell.xpath('.//a')
        if not anchors:
            continue
        name = anchors[0].text_content()

        # Keep span texts that are non-empty once non-breaking spaces
        # are treated as ordinary whitespace.
        districts = [
            span.strip() for span in cell.xpath('.//span/text()')
            if re.sub(u'\xa0', ' ', span).strip()
        ]
        district = districts[1]
        if district == 'At Large':
            district = 'Moncton'
        elif district == 'Deputy Mayor':
            district = districts[2]

        url = anchors[-1].attrib['href']
        detail = lxmlize(url)

        p = Legislator(name=name, post_id=district, role='Councillor')
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = cell.xpath('.//img/@src')[0]

        mailto = detail.xpath('string(.//a[contains(@href, "mailto:")]/@href)')
        p.add_contact('email', mailto[len('mailto:'):], None)

        contact_info = detail.xpath(
            './/table[@class="whiteroundedbox"]//td/p[contains(text()," ")]'
        )[0].text_content()
        for match in re.findall(r'(([0-9]{3}-)?([0-9]{3}-[0-9]{4}))', contact_info):
            number = match[0]
            # Seven-digit numbers lack an area code; prefix 506.
            if len(re.sub(r'\D', '', number)) == 7:
                number = '506-%s' % number
            p.add_contact('voice', number, 'legislature')
        yield p
def get_people(self):
    """Yield an MLA Legislator for each row of the members table.

    The first three text nodes of a row are the "Last,First" name, the
    party abbreviation and the post.  Image, phone and e-mail come from
    ``get_details`` on the row's first link.
    """
    page = lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//div[@id="content"]/table/tbody/tr')
    for row in rows:
        texts = row.xpath('./td//text()')
        full_name, party_abbr, post = texts[:3]
        # Swap the comma-separated parts and join them with a space.
        name_parts = full_name.split(',')
        name = ' '.join(name_parts[::-1])

        detail_url = row[0][0].attrib['href']
        image, phone, email = get_details(detail_url)

        p = Legislator(name=name, post_id=post, role='MLA',
                       party=get_party(party_abbr), image=image)
        p.add_source(COUNCIL_PAGE)
        p.add_source(detail_url)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield a Legislator for every table row on the council page.

    A row whose post reads 'Maire de Laval' is the mayor; otherwise the
    district is the post with its "Circonscription no N - " prefix, any
    "L'" and all spaces removed, and 'bois' capitalised.
    """
    page = lxmlize(COUNCIL_PAGE)
    for councillor_row in page.xpath('//tr'):
        post = councillor_row.xpath('string(./td[2]/p/text())')
        if post == 'Maire de Laval':
            district = 'Laval'
            role = 'Maire'
        else:
            district = re.sub(r'^C.?irconscription (?:no )?\d+\D- ', '', post)
            district = district.replace("L'", '').replace(' ', '').replace('bois', 'Bois')
            role = 'Conseiller'

        full_name = councillor_row.xpath('string(./td[2]/p/text()[2])').strip()
        # Drop the first word of the name line.
        name = ' '.join(full_name.split()[1:])

        phone = councillor_row.xpath(
            'string(.//span[@class="icon-phone"]/following::text())')
        email = councillor_row.xpath(
            'string(.//a[contains(@href, "mailto:")]/@href)')[len('mailto:'):]
        photo_url = councillor_row[0][0].attrib['src']

        p = Legislator(name=name, post_id=district, role=role, image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield one MLA ``Legislator`` per non-vacant row of the first table.

    Rows are expected to hold exactly three cells: name ("Last, First"),
    constituency and party.  The header row is skipped, as are rows whose
    name cell reads "vacant" (case-insensitive).  The photo and email are
    fetched through ``get_details`` from the page linked in the name cell.
    """
    member_page = lxmlize(COUNCIL_PAGE)
    table = member_page.xpath('//table')[0]
    for row in table.cssselect('tr')[1:]:
        namecell, constitcell, partycell = row.cssselect('td')
        full_name = namecell.text_content().strip()
        if full_name.lower() == 'vacant':
            continue

        last, first = full_name.split(',')
        # Drop the honorific and title-case the surname.
        name = first.replace('Hon.', '').strip() + ' ' + last.title().strip()
        # Collapse any run of whitespace inside the constituency name.
        district = ' '.join(constitcell.text_content().split())
        party = get_party(partycell.text)

        url = namecell.cssselect('a')[0].get('href')
        photo, email = get_details(url)

        p = Legislator(name=name, post_id=district, role='MLA',
                       party=party, image=photo)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield the mayor, then up to two councillors for each ward heading.

    The mayor's paragraph is passed to ``self.scrape_mayor`` and its result
    is yielded as-is.  For every ``<h3>`` ward heading, the first two
    following ``<p>`` siblings are each turned into a ``Legislator`` with
    role ``'Councillor'``.
    """
    page = lxmlize(COUNCIL_PAGE)
    mayor_info = page.xpath(
        '//h2[contains(text(), "MAYOR")]//following-sibling::p')[0]
    yield self.scrape_mayor(mayor_info)

    for ward in page.xpath('//h3'):
        district = re.sub(r'\AWARD \d+ - ', '', ward.text_content())
        # Only the first two following paragraphs are used (presumably two
        # councillors per ward -- confirm against the page).  Slicing
        # instead of comparing against councillors[1] avoids an IndexError
        # when fewer than two paragraphs follow the heading.
        for councillor in ward.xpath('following-sibling::p')[:2]:
            name = councillor.xpath('./strong')[0].text_content()
            p = Legislator(name=name, post_id=district, role='Councillor')
            p.add_source(COUNCIL_PAGE)

            info = councillor.xpath('./text()')
            # The first text node is the address; the rest hold numbers.
            address = info.pop(0)
            p.add_contact('address', address, 'legislature')

            # Get phone numbers.  Each remaining line is split on U+00BB
            # and U+00A0; empty pieces and the non-breaking-space
            # separators themselves are discarded.  Plain u'' literals
            # replace the former ur'' ones, which are a syntax error on
            # Python 3; the patterns match the same characters.
            for line in info:
                pieces = re.split(u'(\xbb)|(\xa0)', line)
                tmp = [y for y in pieces if y and not re.match(u'\xa0', y)]
                self.get_tel_numbers(tmp, p)

            email = councillor.xpath('string(./a)')
            p.add_contact('email', email, None)
            yield p
def get_people(self):
    """Yield one MLA ``Legislator`` for each occupied seat in the table.

    The first table on the page is read row by row, skipping the header.
    Every row must unpack into three cells (name as "Last, First",
    constituency, party); rows whose name is "vacant" in any letter case
    are ignored.  ``get_details`` supplies the photo and email from the
    member's linked page.
    """
    member_page = lxmlize(COUNCIL_PAGE)
    table = member_page.xpath('//table')[0]
    rows = table.cssselect('tr')[1:]
    for row in rows:
        namecell, constitcell, partycell = row.cssselect('td')
        full_name = namecell.text_content().strip()
        if full_name.lower() == 'vacant':
            continue

        last, first = full_name.split(',')
        # Remove the "Hon." honorific and title-case the surname.
        given = first.replace('Hon.', '').strip()
        name = given + ' ' + last.title().strip()
        # Normalise internal whitespace in the constituency name.
        district = ' '.join(constitcell.text_content().split())
        party = get_party(partycell.text)

        url = namecell.cssselect('a')[0].get('href')
        photo, email = get_details(url)

        p = Legislator(name=name, post_id=district, role='MLA',
                       party=party, image=photo)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('email', email, None)
        yield p
def get_people(self):
    """Yield a ``Legislator`` for every council slide on the page.

    The name, position and photo come from the slide element itself.  The
    email and phone number are looked up in the page's ``mailto:`` links by
    matching the member's name against the link text; members without a
    matching link are still yielded, just without those contact details.
    """
    page = lxmlize(COUNCIL_PAGE)
    councillor_elems = page.xpath('//a[contains(@class, "slide item-")]')
    email_links = page.xpath('//a[contains(@href, "mailto:")]')

    for elem in councillor_elems:
        name_elem = elem.xpath('.//strong')[0]
        # Drop an optional leading "Mr. " from the displayed name.
        name = re.search(r'(Mr\. )?(.+)', name_elem.text).group(2)
        position = name_elem.xpath('string(following-sibling::text())')

        if 'Mayor' in position:
            district = 'Brossard'
            role = 'Maire'
        else:
            # Keep the text up to and including the first digit.
            district = re.sub(r'(?<=[0-9]).+', '', position).strip()
            role = 'Conseiller'

        # The photo is referenced from the inline CSS "url(...)" value.
        photo = re.search(r'url\((.+)\)', elem.attrib['style']).group(1)
        p = Legislator(name=name, post_id=district, role=role, image=photo)
        p.add_source(COUNCIL_PAGE)

        # Link text may use a typographic apostrophe; normalise it before
        # comparing.  A name spelled differently in the two places simply
        # finds no link and gets no contact details.
        matching_links = [
            link for link in email_links
            if name in link.text_content().replace(u'\u2019', "'")
        ]
        if matching_links:
            email_elem = matching_links[0]
            # Capture the address after the scheme, stopping at any query
            # string.  The previous pattern was a character class that
            # could capture at most one character, and raised
            # AttributeError when it did not match.
            email_match = re.match(r'mailto:([^?\s]+)',
                                   email_elem.attrib['href'])
            if email_match:
                p.add_contact('email', email_match.group(1), None)

            phones = email_elem.xpath(
                './following-sibling::text()[contains(., "450")]')
            if phones:
                p.add_contact('voice', phones[0], 'legislature')
        yield p
def mayor_data(page):
    """Build the mayor's ``Legislator`` from an already parsed page.

    The name and photo come from the ``img.mayorsPic`` element; the email,
    address and phone number come from the ``div.address`` block.  The
    record is created with post ``'Brampton'`` and sourced to ``MAYOR_PAGE``.
    """
    portrait = page.xpath('//img[@class="mayorsPic"]')[0]
    # The photo label (alt text) carries the word "Mayor "; remove it.
    name = portrait.xpath('string(./@alt)').replace('Mayor ', '')
    photo_url = portrait.xpath('string(./@src)')

    contact_block = page.xpath('//div[@class="address"]')[0]
    email = contact_block.xpath('string(.//a)')
    # The first three text nodes form the address; the fourth is the phone.
    address_lines = contact_block.xpath('./p/text()')[:3]
    address = ''.join(address_lines)
    phone = contact_block.xpath('string(./p/text()[4])')

    mayor = Legislator(name=name, post_id='Brampton', role='Mayor')
    mayor.add_source(MAYOR_PAGE)
    mayor.add_contact('voice', phone, 'legislature')
    mayor.add_contact('address', address, 'legislature')
    mayor.add_contact('email', email, None)
    mayor.image = photo_url
    return mayor
def councillor_data(url, name, role):
    """Return a ``Legislator`` for the council member described at *url*.

    *name* and *role* are supplied by the caller; the email, phone number
    and photo are scraped from the member's page.  The phone contact is
    only added when a number is actually found.
    """
    page = lxmlize(url)
    email = page.xpath('string(//a[contains(@href, "mailto")])')
    phone_str = page.xpath('string(//div[@id="content"]//strong[1]/'
                           'following-sibling::text()[contains(., "Phone")])')
    # The text looks like "<label>: <number>".  Take the segment after the
    # first colon, trimmed; when no such text exists the XPath yields ''
    # and the phone is left empty instead of raising IndexError.
    segments = phone_str.split(':')
    phone = segments[1].strip() if len(segments) > 1 else ''
    photo_url = urljoin(
        url, page.xpath('string(//div[@id="content"]//img[1]/@src)'))

    # TODO: should post_id be "Neighbourhood Liaison"?
    m = Legislator(name=name, post_id='Victoria', role=role)
    m.add_source(COUNCIL_PAGE)
    m.add_source(url)
    m.add_contact('email', email, None)
    if phone:
        m.add_contact('voice', phone, 'legislature')
    m.image = photo_url
    return m
def scrape_mayor(url):
    """Yield a single ``Legislator`` for the mayor described at *url*.

    Note that this is a generator: calling it returns an iterator that
    produces exactly one record (post ``'Markham'``, role ``'Mayor'``).
    """
    page = lxmlize(url)

    name = page.xpath(
        '//div[@class="interiorContentWrapper"]/p/strong/text()')[0]

    # The second and third text nodes make up the address; runs of
    # whitespace are then collapsed to single spaces.
    text_nodes = page.xpath('//div[@class="interiorContentWrapper"]/p/text()')
    address = re.sub(r'\s{2,}', ' ', ' '.join(text_nodes[1:3]))

    contact_elem = page.xpath(
        '//div[@class="interiorContentWrapper"]/p[3]')[0]
    # The paragraph's leading text is "<label>: <number>".
    phone = contact_elem.text.split(':')[1].strip()
    email = contact_elem.xpath('string(./a)')

    mayor = Legislator(name=name, post_id='Markham', role='Mayor')
    mayor.add_source(url)
    mayor.add_contact('address', address, 'legislature')
    mayor.add_contact('voice', phone, 'legislature')
    mayor.add_contact('email', email, None)
    yield mayor
def scrape_person(url):
    """Return a ``Legislator`` built from the member page at *url*.

    The page title is split at its first space into the role and the name.
    The phone contact is only added when the page has a list item
    containing "Phone:".
    """
    page = lxmlize(url)

    title = page.xpath('string(//title)')
    role, name = title.split(' ', 1)

    photo_url = page.xpath('string(//div[@id="content"]//img[@style]/@src)')
    email = page.xpath('string(//a[contains(@href, "mailto:")])')
    phone = page.xpath('string(//li[contains(text(), "Phone:")])')

    person = Legislator(name=name, post_id='Burnaby', role=role,
                        image=photo_url)
    for source in (COUNCIL_PAGE, url):
        person.add_source(source)
    person.add_contact('email', email, None)
    if phone:
        person.add_contact('voice', phone, 'legislature')
    return person
def get_people(self):
    """Yield a ``Legislator`` for every ``<h2>`` heading on the page.

    Headings read "<role> - <name>" or "<role> - <name> (<POST>)".  When a
    parenthesised post is present it is title-cased; otherwise the post
    defaults to "Clarington".
    """
    page = lxmlize(COUNCIL_PAGE)
    for header in page.xpath('//h2'):
        role, name_post = header.text.split(' - ')

        parsed = re.match(r'(.+) \((.+)\)', name_post)
        if parsed is None:
            # No "(POST)" suffix: the whole remainder is the name.
            name, post = name_post, "Clarington"
        else:
            name, caps_post = parsed.groups()
            post = caps_post.title()

        mailto = header.xpath('string(./following-sibling::a[1]/@href)')
        email = mailto[len('mailto:'):]
        photo_url = header.xpath('string(./following-sibling::img[1]/@src)')

        p = Legislator(name=name, post_id=post, role=role, image=photo_url)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('email', email, None)
        yield p
def councillor_data(url):
    """Return a ``Legislator`` for the councillor page at *url*.

    The ``<h1>`` must read "Councillor <name> - <ward>".  The photo, email
    and phone number are taken from the ``div.usercontent`` block; the
    phone contact is skipped when no number is found.
    """
    page = lxmlize(url)

    heading = page.xpath('string(//h1)')
    name, ward = re.match('Councillor (.+) - (.+)', heading).groups()

    content = page.xpath('//div[@class="usercontent"]')[0]
    # Image paths are resolved against the council page URL.
    relative_photo = content.xpath('string(.//img[1]/@src)')
    photo_url = urljoin(COUNCIL_PAGE, relative_photo)
    email = content.xpath('string(.//a/text()[contains(., "@")])')
    raw_phone = content.xpath('string(.//strong[contains(., "Phone")]/'
                              'following-sibling::text()[1])')
    phone = raw_phone.strip()

    councillor = Legislator(name=name, post_id=ward, role='Councillor')
    councillor.add_source(COUNCIL_PAGE)
    councillor.add_source(url)
    if phone:
        councillor.add_contact('voice', phone, 'legislature')
    councillor.add_contact('email', email, None)
    councillor.image = photo_url
    return councillor