def scrape_mayor(self, div):
    """Build the Mayor's Legislator record from the council listing cell."""
    anchor = div.xpath('.//a')[0]
    mayor_name = anchor.text_content().replace('Mayor', '')
    profile_url = anchor.attrib['href']

    mayor = Legislator(name=mayor_name, post_id='Guelph', role='Mayor')
    mayor.add_source(COUNCIL_PAGE)
    mayor.add_source(profile_url)

    # The third non-blank text node in the cell holds the phone number.
    mayor.add_contact('voice', div.xpath('.//text()[normalize-space()]')[2], 'legislature')
    mayor.add_contact('email', div.xpath('.//a[contains(@href,"mailto:")]')[0].text_content(), None)

    profile = lxmlize(profile_url)
    for network in ('facebook', 'twitter'):
        social = profile.xpath('//div[@class="entry-content"]//a[contains(@href, "%s")]' % network)
        mayor.add_link(social[0].attrib['href'], None)
    mayor.image = profile.xpath('//header/img/@src')[0]
    return mayor
def get_people(self):
    """Yield the mayor and the ward councillors from the council nav menu."""
    listing = lxmlize(COUNCIL_PAGE)
    anchors = listing.xpath('//ul[@class="subNav top"]/li/ul//li/a')
    for index, anchor in enumerate(anchors):
        full_name = anchor.text_content()
        profile_url = anchor.attrib['href']
        profile = lxmlize(profile_url)
        if index == 0:
            # First menu entry is always the mayor.
            ward = 'Ajax'
            position = 'Mayor'
        else:
            heading = profile.xpath('//div[@id="printAreaContent"]//h1')[0].text_content()
            ward = re.findall(r'Ward.*', heading)[0].strip()
            position = re.findall('((Regional)? ?(Councillor))', heading)[0][0]
        person = Legislator(name=full_name, post_id=ward, role=position)
        person.add_source(COUNCIL_PAGE)
        person.add_source(profile_url)
        person.image = profile.xpath('//div[@class="intQuicklinksPhoto"]/img/@src')[0]
        # Contact table: first column is a label, second the value; rows that
        # are not phone/fax/email are treated as links.
        for row in profile.xpath('//table[@class="datatable"][1]//tr')[1:]:
            label = row.xpath('./td')[0].text_content().strip()
            value = row.xpath('./td')[1].text_content().strip()
            if re.match(r'(Phone)|(Fax)|(Email)', label):
                kind = CONTACT_DETAIL_TYPE_MAP[label]
                person.add_contact(kind, value, None if kind == 'email' else 'legislature')
            else:
                person.add_link(value, None)
        yield person
def get_people(self):
    """Yield candidate Legislators parsed from the CSV at COUNCIL_PAGE.

    Columns named "Office N: ..." are folded into per-office contact
    dicts; remaining columns map into Legislator kwargs, links, or
    extras via the module-level key tables. Unknown columns raise.
    """
    reader = csv_reader(COUNCIL_PAGE, header=True)
    for row in reader:
        kwargs = {'role': 'candidate'}
        email = None
        links = []
        extra = {}
        offices = []
        for k, v in row.items():
            v = v.strip()
            if not v:
                continue
            k = k.strip()
            match = re.search(r'\AOffice (\d): ', k)
            if match:
                index = int(match.group(1))
                # Grow the list so offices[index - 1] exists.
                while index > len(offices):
                    offices.append({})
                if k[10:] == 'Type':
                    offices[index - 1]['note'] = v
                elif k[10:] in CONTACT_TYPE_KEYS:
                    offices[index - 1][CONTACT_TYPE_KEYS[k[10:]]] = v
                else:
                    raise Exception(k)
            elif k == 'Party Name':
                kwargs['party'] = PARTY_MAP[v]
            elif k in KEYS:
                kwargs[KEYS[k]] = v
            elif k == 'Email':
                email = v
            elif k in LINKS_KEYS:
                links.append({'url': v, 'note': k})
            elif k in IGNORE_KEYS:
                continue
            elif k in EXTRA_KEYS:
                extra[re.sub(r'[^a-z0-9_]', '', k.lower().replace(' ', '_'))] = v
            else:
                raise Exception(k)

        contacts = []
        for office in offices:
            for contact_type in CONTACT_TYPE_KEYS.values():
                if office.get(contact_type):
                    # BUG FIX: was contacts.push(...) — Python lists have no
                    # push() — and used the loop variable as a dict key
                    # ({type: type}) instead of the literal key 'type'.
                    contacts.append({
                        'note': office['note'],
                        'type': contact_type,
                        'value': office[contact_type],
                    })

        if 'name' in kwargs:
            p = Legislator(**kwargs)
            p.add_source(COUNCIL_PAGE)
            if email:
                p.add_contact('email', email, None)
            for link in links:
                # BUG FIX: was p.add_link(**links) — unpacked the whole list
                # instead of the current link dict.
                p.add_link(**link)
            for contact in contacts:
                p.add_contact(**contact)
            for k, v in extra.items():
                p.add_extra(k, v)
            yield p
def get_people(self):
    """Yield Newmarket's mayor and councillors from the council page."""
    page = lxmlize(COUNCIL_PAGE)
    # Presumably the first four cells and the trailing cell are layout/header
    # cells — TODO confirm against the live page.
    councillors = page.xpath('//div[@id="printArea"]//table//tr//td')[4:-1]
    yield self.scrape_mayor(councillors[0])
    for councillor in councillors[1:]:
        name = ' '.join(councillor.xpath('string(.//strong/a[last()])').split())
        infostr = councillor.xpath('string(.//strong)')
        try:
            # "Name - Ward N": the ward follows the dash.
            district = infostr.split('-')[1]
            role = 'Councillor'
        except IndexError:
            # No dash: the at-large regional councillor.
            district = 'Newmarket'
            role = 'Regional Councillor'
        url = councillor.xpath('.//a/@href')[0]
        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = councillor.xpath('.//img/@src')[0]
        page = lxmlize(url)
        info = page.xpath('//div[@id="printArea"]')[0]
        info = info.xpath('.//p[@class="heading"][2]/following-sibling::p')
        # NOTE(review): address is parsed (and the pop() calls advance the
        # paragraph list, which later parsing depends on) but the value is
        # never attached as a contact — possibly an omission; confirm.
        address = info.pop(0).text_content().strip()
        if not address:
            address = info.pop(0).text_content().strip()
        if 'Ward' in info[0].text_content():
            info.pop(0)
        # Remaining paragraph holds "Label: number" pairs joined by ':'.
        numbers = info.pop(0).text_content().split(':')
        email = page.xpath('//a[contains(@href, "mailto:")]/text()')[0]
        p.add_contact('email', email, None)
        for i, contact in enumerate(numbers):
            if i == 0:
                continue
            if '@' in contact:
                continue  # executive assistant email
            else:
                number = re.findall(r'([0-9]{3}-[0-9]{3}-[0-9]{4})', contact)[0]
                ext = re.findall(r'(Ext\. [0-9]{3,4})', contact)
                if ext:
                    number = number + ext[0].replace('Ext. ', ' x')
                # The label for this number is the trailing word of the
                # previous ':'-separated chunk.
                contact_type = re.findall(r'[A-Za-z]+$', numbers[i - 1])[0]
                if 'Fax' in contact_type:
                    p.add_contact('fax', number, 'legislature')
                elif 'Phone' in contact_type:
                    p.add_contact('voice', number, 'legislature')
                else:
                    p.add_contact(contact_type, number, contact_type)
        site = page.xpath('.//a[contains(text(), "http://")]')
        if site:
            p.add_link(site[0].text_content(), None)
        yield p
def get_people(self):
    """Yield Newmarket's mayor and councillors from the council page."""
    page = lxmlize(COUNCIL_PAGE)
    # Presumably the first four cells and the trailing cell are layout/header
    # cells — TODO confirm against the live page.
    councillors = page.xpath('//div[@id="printArea"]//table//tr//td')[4:-1]
    yield self.scrape_mayor(councillors[0])
    for councillor in councillors[1:]:
        name = ' '.join(councillor.xpath('string(.//strong/a[last()])').split())
        infostr = councillor.xpath('string(.//strong)')
        try:
            # "Name - Ward N": the ward follows the dash.
            district = infostr.split('-')[1]
            role = 'Councillor'
        except IndexError:
            # No dash: the at-large regional councillor.
            district = 'Newmarket'
            role = 'Regional Councillor'
        url = councillor.xpath('.//a/@href')[0]
        p = Legislator(name=name, post_id=district, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.image = councillor.xpath('.//img/@src')[0]
        page = lxmlize(url)
        info = page.xpath('//div[@id="printArea"]')[0]
        info = info.xpath('.//p[@class="heading"][2]/following-sibling::p')
        # NOTE(review): address is parsed (and the pop() calls advance the
        # paragraph list, which later parsing depends on) but the value is
        # never attached as a contact — possibly an omission; confirm.
        address = info.pop(0).text_content().strip()
        if not address:
            address = info.pop(0).text_content().strip()
        if 'Ward' in info[0].text_content():
            info.pop(0)
        # Remaining paragraph holds "Label: number" pairs joined by ':'.
        numbers = info.pop(0).text_content().split(':')
        email = page.xpath('//a[contains(@href, "mailto:")]/text()')[0]
        p.add_contact('email', email, None)
        for i, contact in enumerate(numbers):
            if i == 0:
                continue
            if '@' in contact:
                continue  # executive assistant email
            else:
                number = re.findall(r'([0-9]{3}-[0-9]{3}-[0-9]{4})', contact)[0]
                ext = re.findall(r'(Ext\. [0-9]{3,4})', contact)
                if ext:
                    number = number + ext[0].replace('Ext. ', ' x')
                # The label for this number is the trailing word of the
                # previous ':'-separated chunk.
                contact_type = re.findall(r'[A-Za-z]+$', numbers[i - 1])[0]
                if 'Fax' in contact_type:
                    p.add_contact('fax', number, 'legislature')
                elif 'Phone' in contact_type:
                    p.add_contact('voice', number, 'legislature')
                else:
                    p.add_contact(contact_type, number, contact_type)
        site = page.xpath('.//a[contains(text(), "http://")]')
        if site:
            p.add_link(site[0].text_content(), None)
        yield p
def get_people(self):
    """Yield Markham's mayor, ward councillors, and regional councillors."""
    listing = lxmlize(COUNCIL_PAGE)
    yield scrape_mayor(listing.xpath('//a[contains(text(), "Office of the Mayor")]/@href')[0])
    for cell in listing.xpath('//div[@class="interiorContentWrapper"]//td[./a]'):
        member_name = cell.xpath('.//strong')[1].text_content().strip()
        label = cell.xpath('.//a//text()[normalize-space()]')[0]
        if 'Ward' in label:
            post = label.replace('Councillor', '')
            position = 'Councillor'
        else:
            post = 'Markham'
            position = label
        portrait = cell.xpath('.//img/@src')[0]
        member_url = cell.xpath('.//a/@href')[0]
        if 'Ward 4' in post:
            # Ward 4's page has a different layout, handled by its own scraper.
            yield scrape_4(member_name, member_url, portrait)
            continue
        profile = lxmlize(member_url)
        person = Legislator(name=member_name, post_id=post, role=position)
        person.add_source(COUNCIL_PAGE)
        person.add_source(member_url)
        person.image = portrait
        box = profile.xpath('//div[@class="microSiteLinksWrapper"]')[1]
        # Details live in <p> nodes, or in <div> nodes on some pages.
        details = box.xpath('.//p/text()') or box.xpath('.//div/text()')
        street = re.sub(r'\s{2,}', ' ', ' '.join(details[:2])).strip()
        telephone = details[2].split(':')[1].strip()
        mailto = box.xpath('.//a[contains(@href,"mailto:")]/text()')[0]
        homepage = box.xpath('.//a[not( contains(@href, "mailto:"))]/text()')
        if homepage:
            person.add_link(homepage[0], None)
        person.add_contact('address', street, 'legislature')
        person.add_contact('voice', telephone, 'legislature')
        person.add_contact('email', mailto, None)
        get_links(person, box)
        yield person
def get_people(self):
    """Yield Saskatchewan MLAs with legislature and constituency contacts."""
    listing = lxmlize(COUNCIL_PAGE)
    for row in listing.xpath('//table[@id="MLAs"]//tr')[1:]:
        cells = row.xpath('./td')
        # Cell text is "Hon. Name" / "Mr. Name": keep everything after the title.
        full_name = cells[0].text_content().split('. ', 1)[1]
        caucus = cells[1].text
        riding = cells[2].text_content()
        member_url = row.xpath('./td[1]/a/@href')[0]
        profile = lxmlize(member_url)
        member = Legislator(name=full_name, post_id=riding, role='MLA', party=caucus)
        member.add_source(COUNCIL_PAGE)
        member.add_source(member_url)
        contact = profile.xpath('//table[@id="mla-contact"]//tr[2]')[0]
        website = contact.xpath('./td[3]//div[3]//a')
        if website:
            member.add_link(website[0].text_content(), None)
        member.add_contact('address', contact.xpath('./td[1]/div[2]')[0].text_content(), 'legislature')
        member.add_contact('address', ''.join(contact.xpath('./td[2]/div//text()')[1:7]), 'constituency')
        numbers = [
            contact.xpath('./td[1]/div[3]')[0].text_content().split(':')[1].strip(),
            contact.xpath('./td[2]/div[4]//span/text()')[0],
            contact.xpath('./td[1]/div[4]')[0].text_content().split(':')[1].strip(),
            contact.xpath('./td[2]/div[5]//span/text()')[0],
        ]
        # Short numbers are listed without the 306 area code.
        numbers = ['306-%s' % n if len(n) < 10 else n for n in numbers]
        member.add_contact('voice', numbers[0], 'legislature')
        member.add_contact('voice', numbers[1], 'constituency')
        member.add_contact('fax', numbers[2], 'legislature')
        member.add_contact('fax', numbers[3], 'constituency')
        member.add_contact('email', contact.xpath('./td[3]//a[contains(@href, "mailto:")]/text()')[0], None)
        yield member
def get_people(self):
    """Yield the mayor (scraped separately) followed by the councillors."""
    roster = lxmlize(COUNCIL_PAGE)
    yield scrape_mayor()
    for row in roster.xpath('//div[@id="centre_content"]//tr'):
        # Skip the table's header row.
        if "Position" in row.text_content():
            continue
        cells = row.xpath("./td")
        ward = cells[0].text_content().replace("Councillor", "")
        member_name = cells[1].text_content()
        member_url = row.xpath("./td/a")[0].attrib["href"]
        person = Legislator(name=member_name, post_id=ward, role="Councillor")
        person.add_source(COUNCIL_PAGE)
        person.add_source(member_url)
        profile = lxmlize(member_url)
        street = profile.xpath('//div[@id="centre_content"]//p')[0].text_content().replace("\r\n", ", ")
        mailto = profile.xpath('//a[contains(@href,"mailto:")]')[0].attrib["href"].replace("mailto:", "")
        person.add_contact("address", street, "legislature")
        person.add_contact("email", mailto, None)
        person.image = profile.xpath('//div[@id="centre_content"]//img/@src')[0]
        blob = profile.xpath('//div[@id="centre_content"]//p[contains(text(),"-")]')[0].text_content()
        # Each number precedes its label in the text, e.g. "905 555-1234 tel".
        for label, kind in (("tel", "voice"), ("cell", "cell"), ("fax", "fax")):
            if label in blob:
                value = re.findall(r"(.*)%s" % label, blob)[0].strip().replace(" ", "-")
                if label == "tel":
                    value = value.replace("\\xc2", "").replace("\\xa0", "-")
                person.add_contact(kind, value, "legislature")
        anchors = profile.xpath('//div[@id="centre_content"]//a')
        if len(anchors) > 2:
            person.add_link(anchors[-1].attrib["href"], None)
        yield person
def get_people(self):
    """Yield Guelph's mayor and ward councillors."""
    roster = lxmlize(COUNCIL_PAGE)
    members = roster.xpath('//*[@class="two_third last"]')
    for index, member in enumerate(members):
        if index == 0:
            # First block on the page is the mayor.
            yield self.scrape_mayor(member)
            continue
        anchor = member.xpath('.//a')[0]
        member_name = anchor.text_content().replace('Councillor', '').replace('Mayor', '')
        texts = member.xpath('.//text()[normalize-space()]')
        ward = texts[2]
        member_url = anchor.attrib['href']
        person = Legislator(name=member_name, post_id=ward, role='Councillor')
        person.add_source(COUNCIL_PAGE)
        person.add_source(member_url)
        person.add_contact('voice', texts[3].replace('extension', 'x'), 'legislature')
        mailto = member.xpath('.//a[contains(@href,"mailto:")]')
        if mailto:
            person.add_contact('email', mailto[0].text_content(), None)
        website = member.xpath('.//a[contains(text(),"Website")]')
        if website:
            person.add_link(website[0].attrib['href'], None)
        profile = lxmlize(member_url)
        person.image = profile.xpath('//header/img/@src')[0]
        street = re.findall(
            r'Address: (.*)Phone',
            profile.xpath('//div[@class="entry-content"]')[0].text_content())
        if street:
            person.add_contact('address', street[0], 'legislature')
        # Optional extra links: blog plus social profiles from the body.
        for found in (profile.xpath('//a[contains(text(),"Blog")]'),
                      profile.xpath('//div[@class="entry-content"]//a[contains(@href, "facebook")]'),
                      profile.xpath('//div[@class="entry-content"]//a[contains(@href, "twitter")]')):
            if found:
                person.add_link(found[0].attrib['href'], None)
        yield person
def scrape_mayor(self, div):
    """Build the Mayor's Legislator record from the council listing cell."""
    anchor = div.xpath('.//a')[0]
    mayor_name = anchor.text_content().replace('Mayor', '')
    profile_url = anchor.attrib['href']

    mayor = Legislator(name=mayor_name, post_id='Guelph', role='Mayor')
    mayor.add_source(COUNCIL_PAGE)
    mayor.add_source(profile_url)

    # The third non-blank text node in the cell holds the phone number.
    mayor.add_contact('voice', div.xpath('.//text()[normalize-space()]')[2], 'legislature')
    mayor.add_contact('email', div.xpath('.//a[contains(@href,"mailto:")]')[0].text_content(), None)

    profile = lxmlize(profile_url)
    for network in ('facebook', 'twitter'):
        social = profile.xpath('//div[@class="entry-content"]//a[contains(@href, "%s")]' % network)
        mayor.add_link(social[0].attrib['href'], None)
    mayor.image = profile.xpath('//header/img/@src')[0]
    return mayor
def get_people(self):
    """Yield Pickering's mayor and councillors.

    Phone/fax numbers are shared blocks (one column for the mayor, one
    for councillors) pulled from a separate table and attached with
    add_contacts().
    """
    page = lxmlize(COUNCIL_PAGE)
    mayor_contacts = page.xpath('//table[@class="nicEdit-visualClass"]//tr/td[1]/text()')
    council_contacts = page.xpath('//table[@class="nicEdit-visualClass"]//tr/td[2]/text()')
    councillors = page.xpath('//table[@id="Table3table"]//strong/ancestor::td')
    for councillor in councillors:
        name = councillor.xpath('.//strong/text()')[0]
        if 'Councillor' in name:
            name = name.replace('Councillor', '').strip()
            role_ward = councillor.xpath('./text()')[0]
            if not role_ward.strip():
                role_ward = councillor.xpath('.//p/text()')[0]
            role_ward = role_ward.split(' ')
            # First two words are the role, the rest the ward.
            # BUG FIX: the pattern was the non-raw string '\ACity ', whose \A
            # only worked because Python leaves unknown escapes intact
            # (DeprecationWarning today, SyntaxError in a future release).
            role = re.sub(r'\ACity ', '', ' '.join(role_ward[:2]))
            ward = ' '.join(role_ward[2:])
        else:
            name = councillor.xpath('.//strong/text()')[1]
            role = 'Mayor'
            ward = 'Pickering'
        email = councillor.xpath('.//a[contains(@href, "mailto:")]/text()')[0]
        p = Legislator(name=name, post_id=ward, role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_contact('email', email, None)
        p.image = councillor.xpath('.//img/@src')[0]
        for link in councillor.xpath('.//a'):
            if '@' in link.text_content():
                continue  # mailto link; email already added above
            if 'Profile' in link.text_content():
                p.add_source(link.attrib['href'])
            else:
                p.add_link(link.attrib['href'], None)
        if role == 'Mayor':
            add_contacts(p, mayor_contacts)
        else:
            add_contacts(p, council_contacts)
        yield p
def get_people(self):
    """Yield Richmond Hill's mayor, regional councillors, and ward councillors."""
    roster = lxmlize(COUNCIL_PAGE)
    for anchor in roster.xpath('//center/center//a'):
        member_name = anchor.text_content().strip()
        profile_url = anchor.attrib['href']
        profile = lxmlize(profile_url)
        heading = profile.xpath('//div[@class="sectionheading"]')[0].text_content()
        if heading == 'Mayor of Richmond Hill':
            post = 'Richmond Hill'
            position = 'Mayor'
        else:
            # Ward sits between the comma and the dash; regional councillors
            # have no ward segment in the heading.
            ward_match = re.findall(r',(.*)-', heading)
            post = ward_match[0].strip() if ward_match else 'Richmond Hill'
            position = 'Regional Councillor' if 'Regional' in heading else 'Councillor'
        cell = profile.xpath('//table[@cellpadding>0]/tbody/tr/td[last()]|//table[not(@cellpadding)]/tbody/tr/td[last()]')
        details = cell[0].text_content().replace(' - office:', ':')
        street = re.findall(r'(?<=Town of Richmond Hill).*(?=Telephone)', details)[0]
        street = re.sub(r'([a-z])([A-Z])', r'\1 \2', street)  # re-insert collapsed spaces
        telephone = re.findall(r'(?<=Telephone:) (.*)(?=Fax)', details)[0]
        telephone = telephone.replace('(', '').replace(') ', '-').replace(', ext. ', ' x')
        fax_number = re.findall(r'(?<=Fax:) (.*)(?=E-mail)', details)[0]
        fax_number = fax_number.replace(' ', '').replace('(', '').replace(')', '-')
        mailto = profile.xpath('.//a[contains(@href, "mailto:")]/@href')[0].replace('mailto:', '')
        person = Legislator(name=member_name, post_id=post, role=position)
        person.add_source(COUNCIL_PAGE)
        person.add_source(profile_url)
        person.add_contact('address', street, 'legislature')
        person.add_contact('voice', telephone, 'legislature')
        person.add_contact('fax', fax_number, 'legislature')
        person.add_contact('email', mailto, None)
        person.image = profile.xpath('//img[contains(@alt, "%s")]/@src' % member_name)[0]
        if 'Website' in details:
            person.add_link(re.findall(r'www\..*\.[a-z]+', details)[0], None)
        yield person
def get_people(self):
    """Yield the mayor (scraped separately) followed by the councillors."""
    roster = lxmlize(COUNCIL_PAGE)
    yield scrape_mayor()
    for row in roster.xpath('//div[@id="centre_content"]//tr'):
        # Skip the table's header row.
        if 'Position' in row.text_content():
            continue
        cells = row.xpath('./td')
        ward = cells[0].text_content().replace('Councillor', '')
        member_name = cells[1].text_content()
        member_url = row.xpath('./td/a')[0].attrib['href']
        person = Legislator(name=member_name, post_id=ward, role='Councillor')
        person.add_source(COUNCIL_PAGE)
        person.add_source(member_url)
        profile = lxmlize(member_url)
        street = profile.xpath('//div[@id="centre_content"]//p')[0].text_content().replace("\r\n", ', ')
        mailto = profile.xpath('//a[contains(@href,"mailto:")]')[0].attrib['href'].replace('mailto:', '')
        person.add_contact('address', street, 'legislature')
        person.add_contact('email', mailto, None)
        person.image = profile.xpath('//div[@id="centre_content"]//img/@src')[0]
        blob = profile.xpath('//div[@id="centre_content"]//p[contains(text(),"-")]')[0].text_content()
        # Each number precedes its label in the text, e.g. "905 555-1234 tel".
        for label, kind in (('tel', 'voice'), ('cell', 'cell'), ('fax', 'fax')):
            if label in blob:
                value = re.findall(r'(.*)%s' % label, blob)[0].strip().replace(' ', '-')
                if label == 'tel':
                    value = value.replace("\\xc2", '').replace("\\xa0", '-')
                person.add_contact(kind, value, 'legislature')
        anchors = profile.xpath('//div[@id="centre_content"]//a')
        if len(anchors) > 2:
            person.add_link(anchors[-1].attrib['href'], None)
        yield person
def get_people(self):
    """Yield the mayor and the ward councillors from the council nav menu."""
    listing = lxmlize(COUNCIL_PAGE)
    anchors = listing.xpath('//ul[@class="subNav top"]/li/ul//li/a')
    for index, anchor in enumerate(anchors):
        full_name = anchor.text_content()
        profile_url = anchor.attrib['href']
        profile = lxmlize(profile_url)
        if index == 0:
            # First menu entry is always the mayor.
            ward = 'Ajax'
            position = 'Mayor'
        else:
            heading = profile.xpath('//div[@id="printAreaContent"]//h1')[0].text_content()
            ward = re.findall(r'Ward.*', heading)[0].strip()
            position = re.findall('((Regional)? ?(Councillor))', heading)[0][0]
        person = Legislator(name=full_name, post_id=ward, role=position)
        person.add_source(COUNCIL_PAGE)
        person.add_source(profile_url)
        person.image = profile.xpath('//div[@class="intQuicklinksPhoto"]/img/@src')[0]
        # Contact table: first column is a label, second the value; rows that
        # are not phone/fax/email are treated as links.
        for row in profile.xpath('//table[@class="datatable"][1]//tr')[1:]:
            label = row.xpath('./td')[0].text_content().strip()
            value = row.xpath('./td')[1].text_content().strip()
            if re.match(r'(Phone)|(Fax)|(Email)', label):
                kind = CONTACT_DETAIL_TYPE_MAP[label]
                person.add_contact(kind, value, None if kind == 'email' else 'legislature')
            else:
                person.add_link(value, None)
        yield person
def get_people(self):
    """Yield the mayor (handled separately) then each ward councillor."""
    yield scrape_mayor()
    listing = lxmlize(COUNCIL_PAGE)
    for header_cell in listing.xpath('//th[contains(text(), "Ward")]'):
        ward = header_cell.text
        # The header cell's second child is the councillor's profile link.
        link = header_cell[1]
        profile_url = link.attrib['href']
        profile = lxmlize(profile_url)
        person = Legislator(name=link.text, post_id=ward, role='Councillor')
        person.add_source(COUNCIL_PAGE)
        person.add_source(profile_url)
        portrait = profile.xpath('//div[@id="contentArea"]//img/@src')
        if portrait:
            person.image = portrait[0]
        street = profile.xpath('//address//p')
        if street:
            person.add_contact('address', street[0].text_content(), 'legislature')
        for row in profile.xpath('//table[@class="contactListing"]//tr'):
            label = row.xpath('./th/text()')[0]
            value = row.xpath('./td//text()')[0]
            if 'Title' in label:
                continue
            if 'Website' in label or 'Facebook' in label or 'Twitter' in label:
                person.add_link(row.xpath('./td/a/text()')[0], None)
            elif 'Telephone' in label:
                person.add_contact('voice', value, 'legislature')
            elif 'Fax' in label:
                person.add_contact('fax', value, 'legislature')
            elif 'Email' in label:
                person.add_contact('email', value, None)
        yield person
def get_people(self):
    """Yield Saskatchewan MLAs with legislature and constituency contacts."""
    listing = lxmlize(COUNCIL_PAGE)
    for row in listing.xpath('//table[@id="MLAs"]//tr')[1:]:
        cells = row.xpath("./td")
        # Cell text is "Hon. Name" / "Mr. Name": keep everything after the title.
        full_name = cells[0].text_content().split(". ", 1)[1]
        caucus = cells[1].text
        riding = cells[2].text_content()
        member_url = row.xpath("./td[1]/a/@href")[0]
        profile = lxmlize(member_url)
        member = Legislator(name=full_name, post_id=riding, role="MLA", party=caucus)
        member.add_source(COUNCIL_PAGE)
        member.add_source(member_url)
        contact = profile.xpath('//table[@id="mla-contact"]//tr[2]')[0]
        website = contact.xpath("./td[3]//div[3]//a")
        if website:
            member.add_link(website[0].text_content(), None)
        member.add_contact("address", contact.xpath("./td[1]/div[2]")[0].text_content(), "legislature")
        member.add_contact("address", "".join(contact.xpath("./td[2]/div//text()")[1:7]), "constituency")
        numbers = [
            contact.xpath("./td[1]/div[3]")[0].text_content().split(":")[1].strip(),
            contact.xpath("./td[2]/div[4]//span/text()")[0],
            contact.xpath("./td[1]/div[4]")[0].text_content().split(":")[1].strip(),
            contact.xpath("./td[2]/div[5]//span/text()")[0],
        ]
        # Short numbers are listed without the 306 area code.
        numbers = ["306-%s" % n if len(n) < 10 else n for n in numbers]
        member.add_contact("voice", numbers[0], "legislature")
        member.add_contact("voice", numbers[1], "constituency")
        member.add_contact("fax", numbers[2], "legislature")
        member.add_contact("fax", numbers[3], "constituency")
        member.add_contact("email", contact.xpath('./td[3]//a[contains(@href, "mailto:")]/text()')[0], None)
        yield member
def get_people(self):
    """Yield Guelph's mayor and ward councillors."""
    roster = lxmlize(COUNCIL_PAGE)
    members = roster.xpath('//*[@class="two_third last"]')
    for index, member in enumerate(members):
        if index == 0:
            # First block on the page is the mayor.
            yield self.scrape_mayor(member)
            continue
        anchor = member.xpath('.//a')[0]
        member_name = anchor.text_content().replace('Councillor', '').replace('Mayor', '')
        texts = member.xpath('.//text()[normalize-space()]')
        ward = texts[2]
        member_url = anchor.attrib['href']
        person = Legislator(name=member_name, post_id=ward, role='Councillor')
        person.add_source(COUNCIL_PAGE)
        person.add_source(member_url)
        person.add_contact('voice', texts[3].replace('extension', 'x'), 'legislature')
        mailto = member.xpath('.//a[contains(@href,"mailto:")]')
        if mailto:
            person.add_contact('email', mailto[0].text_content(), None)
        website = member.xpath('.//a[contains(text(),"Website")]')
        if website:
            person.add_link(website[0].attrib['href'], None)
        profile = lxmlize(member_url)
        person.image = profile.xpath('//header/img/@src')[0]
        street = re.findall(
            r'Address: (.*)Phone',
            profile.xpath('//div[@class="entry-content"]')[0].text_content())
        if street:
            person.add_contact('address', street[0], 'legislature')
        # Optional extra links: blog plus social profiles from the body.
        for found in (profile.xpath('//a[contains(text(),"Blog")]'),
                      profile.xpath('//div[@class="entry-content"]//a[contains(@href, "facebook")]'),
                      profile.xpath('//div[@class="entry-content"]//a[contains(@href, "twitter")]')):
            if found:
                person.add_link(found[0].attrib['href'], None)
        yield person
def get_people(self):
    """Yield Vaughan's mayor and (regional) councillors."""
    page = lxmlize(COUNCIL_PAGE)
    councillors = page.xpath('//div[@class="PL_Column1"]//ul[@class="dfwp-list"][1]/li/div/div/a')
    for councillor in councillors:
        url = councillor.attrib['href']
        page = lxmlize(url)
        title = page.xpath('//div[@class="PL_Title"]')[0].text_content()
        if "Councillor" in title:
            # Title like "Ward N Councillor Jane Doe": split around the word.
            district, name = re.split(r'Councillor', title)
            role = 'Councillor'
            if "Regional" in district:
                district = "Vaughan"
                role = 'Regional Councillor'
        else:
            name = re.split(r'Mayor', title)[-1]
            district = 'Vaughan'
            role = 'Mayor'
        name = name.strip()
        # The first profile keeps its contact block in web part WPQ2;
        # the others use WPQ3.
        if councillor == councillors[0]:
            contact_info = page.xpath('//div[@id="WebPartWPQ2"]')[0]
        else:
            contact_info = page.xpath('//div[@id="WebPartWPQ3"]')[0]
        phone = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4} ext. [0-9]{4}', contact_info.text_content())[0].replace('ext. ', 'x')
        # The second plain number in the block is taken as the fax line.
        fax = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4}', contact_info.text_content())[1]
        email = contact_info.xpath('.//a[contains(@href, "mailto:")]')[0].text_content()
        p = Legislator(name=name, post_id=district.strip(), role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email, None)
        image = page.xpath('//img[contains(@alt, "Councillor")]/@src')
        if image:
            p.image = image[0]
        # NOTE(review): `sites` is assigned but never used — the social-media
        # lookups below search the whole page instead. They were possibly
        # meant to be sites.xpath(...); confirm before changing.
        sites = page.xpath('//div[@id="WebPartWPQ5"]')[0]
        if page.xpath('.//a[contains(@href,"facebook")]'):
            p.add_link(page.xpath('.//a[contains(@href,"facebook")]')[0].attrib['href'], None)
        if page.xpath('.//a[contains(@href,"twitter")]'):
            p.add_link(page.xpath('.//a[contains(@href,"twitter")]')[0].attrib['href'], None)
        if page.xpath('.//a[contains(@href,"youtube")]'):
            p.add_link(page.xpath('.//a[contains(@href, "youtube")]')[0].attrib['href'], None)
        yield p
def get_people(self):
    """Yield federal MPs from the House of Commons member list."""
    # Twitter handles come from an external lookup service keyed by MP name.
    screen_names = json.loads(requests.get("http://scrapers-ruby.herokuapp.com/twitter_users").content)
    page = lxmlize(COUNCIL_PAGE)
    rows = page.xpath('//div[@class="main-content"]//tr')[1:]
    for row in rows:
        name_cell = row.xpath("./td[1]")[0]
        # Listing shows "Last, First" in two spans.
        last_name = name_cell.xpath("string(.//span[1])")
        first_name = name_cell.xpath("string(.//span[2])")
        name = "%s %s" % (first_name, last_name)
        constituency = row.xpath("string(./td[2])")
        # NOTE(review): province is read but never used below — confirm
        # whether it was meant to be attached to the record.
        province = row.xpath("string(./td[3])")
        party = row.xpath("string(./td[4])")
        url = name_cell.xpath("string(.//a/@href)")
        mp_page = lxmlize(url)
        email = mp_page.xpath('string(//span[@class="caucus"]/' 'a[contains(., "@")])')
        photo = mp_page.xpath('string(//div[@class="profile overview header"]//' "img/@src)")
        m = Legislator(name=name, post_id=constituency, role="MP", chamber="lower", party=party)
        m.add_source(COUNCIL_PAGE)
        m.add_source(url)
        screen_name = screen_names.get(name)
        if screen_name:
            m.add_link("https://twitter.com/%s" % screen_name)
        # @see http://www.parl.gc.ca/Parliamentarians/en/members/David-Yurdiga%2886260%29
        if email:
            m.add_contact("email", email, None)
        elif name == "Adam Vaughan":
            # Hard-coded fallback for a profile with no published email.
            m.add_contact("email", "*****@*****.**", None)
        m.image = photo
        # French-language mailing address for Québec members.
        if mp_page.xpath('string(//span[@class="province"][1])') == u"Québec":
            m.add_contact("address", "Chambre des communes\nOttawa ON K1A 0A6", "legislature")
        else:
            m.add_contact("address", "House of Commons\nOttawa ON K1A 0A6", "legislature")
        voice = mp_page.xpath('string(//div[@class="hilloffice"]//span[contains(text(), "Telephone:")])')
        if voice:
            m.add_contact("voice", voice.replace("Telephone: ", ""), "legislature")
        fax = mp_page.xpath('string(//div[@class="hilloffice"]//span[contains(text(), "Fax:")])').replace(
            "Fax: ", ""
        )
        if fax:
            m.add_contact("fax", fax, "legislature")
        # Each constituency office lists address spans, then phone/fax spans.
        for li in mp_page.xpath('//div[@class="constituencyoffices"]//li'):
            spans = li.xpath('./span[not(@class="spacer")]')
            m.add_contact(
                "address",
                "\n".join(
                    [
                        spans[0].text_content(),  # address line 1
                        spans[1].text_content(),  # address line 2
                        spans[2].text_content(),  # city, region
                        spans[3].text_content(),  # postal code
                    ]
                ),
                "constituency",
            )
            voice = li.xpath('string(./span[contains(text(), "Telephone:")])').replace("Telephone: ", "")
            if voice:
                m.add_contact("voice", voice, "constituency")
            fax = li.xpath('string(./span[contains(text(), "Fax:")])').replace("Fax: ", "")
            if fax:
                m.add_contact("fax", fax, "constituency")
        yield m
def get_people(self):
    """Yield Vaughan's mayor and (regional) councillors."""
    page = lxmlize(COUNCIL_PAGE)
    councillors = page.xpath(
        '//div[@class="PL_Column1"]//ul[@class="dfwp-list"][1]/li/div/div/a'
    )
    for councillor in councillors:
        url = councillor.attrib['href']
        page = lxmlize(url)
        title = page.xpath('//div[@class="PL_Title"]')[0].text_content()
        if "Councillor" in title:
            # Title like "Ward N Councillor Jane Doe": split around the word.
            district, name = re.split(r'Councillor', title)
            role = 'Councillor'
            if "Regional" in district:
                district = "Vaughan"
                role = 'Regional Councillor'
        else:
            name = re.split(r'Mayor', title)[-1]
            district = 'Vaughan'
            role = 'Mayor'
        name = name.strip()
        # The first profile keeps its contact block in web part WPQ2;
        # the others use WPQ3.
        if councillor == councillors[0]:
            contact_info = page.xpath('//div[@id="WebPartWPQ2"]')[0]
        else:
            contact_info = page.xpath('//div[@id="WebPartWPQ3"]')[0]
        phone = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4} ext. [0-9]{4}', contact_info.text_content())[0].replace(
            'ext. ', 'x')
        # The second plain number in the block is taken as the fax line.
        fax = re.findall(r'[0-9]{3}-[0-9]{3}-[0-9]{4}', contact_info.text_content())[1]
        email = contact_info.xpath(
            './/a[contains(@href, "mailto:")]')[0].text_content()
        p = Legislator(name=name, post_id=district.strip(), role=role)
        p.add_source(COUNCIL_PAGE)
        p.add_source(url)
        p.add_contact('voice', phone, 'legislature')
        p.add_contact('fax', fax, 'legislature')
        p.add_contact('email', email, None)
        image = page.xpath('//img[contains(@alt, "Councillor")]/@src')
        if image:
            p.image = image[0]
        # NOTE(review): `sites` is assigned but never used — the social-media
        # lookups below search the whole page instead. They were possibly
        # meant to be sites.xpath(...); confirm before changing.
        sites = page.xpath('//div[@id="WebPartWPQ5"]')[0]
        if page.xpath('.//a[contains(@href,"facebook")]'):
            p.add_link(
                page.xpath('.//a[contains(@href,"facebook")]')
                [0].attrib['href'], None)
        if page.xpath('.//a[contains(@href,"twitter")]'):
            p.add_link(
                page.xpath('.//a[contains(@href,"twitter")]')
                [0].attrib['href'], None)
        if page.xpath('.//a[contains(@href,"youtube")]'):
            p.add_link(
                page.xpath('.//a[contains(@href, "youtube")]')
                [0].attrib['href'], None)
        yield p
def get_people(self):
    """Yield candidate Legislators parsed from the CSV at COUNCIL_PAGE.

    Columns named "Office N: ..." are folded into per-office contact
    dicts; remaining columns map into Legislator kwargs, links, or
    extras via the module-level key tables. Unknown columns raise.
    """
    reader = csv_reader(COUNCIL_PAGE, header=True)
    for row in reader:
        kwargs = {'role': 'candidate'}
        email = None
        links = []
        extra = {}
        offices = []
        for k, v in row.items():
            v = v.strip()
            if not v:
                continue
            k = k.strip()
            match = re.search(r'\AOffice (\d): ', k)
            if match:
                index = int(match.group(1))
                # Grow the list so offices[index - 1] exists.
                while index > len(offices):
                    offices.append({})
                if k[10:] == 'Type':
                    offices[index - 1]['note'] = v
                elif k[10:] in CONTACT_TYPE_KEYS:
                    offices[index - 1][CONTACT_TYPE_KEYS[k[10:]]] = v
                else:
                    raise Exception(k)
            elif k == 'Party Name':
                kwargs['party'] = PARTY_MAP[v]
            elif k in KEYS:
                kwargs[KEYS[k]] = v
            elif k == 'Email':
                email = v
            elif k in LINKS_KEYS:
                links.append({'url': v, 'note': k})
            elif k in IGNORE_KEYS:
                continue
            elif k in EXTRA_KEYS:
                extra[re.sub(r'[^a-z0-9_]', '', k.lower().replace(' ', '_'))] = v
            else:
                raise Exception(k)

        contacts = []
        for office in offices:
            for contact_type in CONTACT_TYPE_KEYS.values():
                if office.get(contact_type):
                    # BUG FIX: was contacts.push(...) — Python lists have no
                    # push() — and used the loop variable as a dict key
                    # ({type: type}) instead of the literal key 'type'.
                    contacts.append({
                        'note': office['note'],
                        'type': contact_type,
                        'value': office[contact_type],
                    })

        if 'name' in kwargs:
            p = Legislator(**kwargs)
            p.add_source(COUNCIL_PAGE)
            if email:
                p.add_contact('email', email, None)
            for link in links:
                # BUG FIX: was p.add_link(**links) — unpacked the whole list
                # instead of the current link dict.
                p.add_link(**link)
            for contact in contacts:
                p.add_contact(**contact)
            for k, v in extra.items():
                p.add_extra(k, v)
            yield p