from scraperwiki import datastore, geo

def extract_table_data(pct_name, s, facility_type):
    """ Extracts data from a list of PCT facilities """
    services = []
    d = {}
    for t in s.getchildren():
        if t.tag == "dt":
            # a <dt> starts a new facility; flush the previous one
            if d != {}:
                services.append(d)
            d = {"PCT": pct_name, "type": "service"}
            u = t.find("a")
            if u is not None:
                t = u
                d["info HTML"] = "http://www.nhs.uk" + t.attrib["href"]
            name = (t.text or "").strip()
            d["name"] = name
            print name
        elif (t.text or "")[:4] == "tel:":
            d["telephone"] = t.text[5:]
        else:
            address = t.text
            d["address"] = address
            postcode = geo.extract_gb_postcode(address)
            d["postcode"] = postcode
            d["latlng"] = geo.gb_postcode_to_latlng(postcode)
    if d != {}:
        services.append(d)  # without this, the final facility was silently dropped
    for d in services:
        if "info HTML" in d:
            scrape_extra(d, facility_type)  # scrape_extra is defined elsewhere in the scraper
        datastore.save(unique_keys=["PCT", "type", "name", "address"], data=d)
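# A minimal usage sketch, not from the original scraper. extract_table_data
# expects `s` to be an lxml element whose children alternate <dt> (facility
# name, optionally wrapping a link) and <dd> (phone or address) entries; the
# fragment and the call below are illustrative assumptions only.
import lxml.html

fragment = lxml.html.fromstring(
    "<dl>"
    "<dt><a href='/example'>Example Surgery</a></dt>"
    "<dd>tel: 01234 567890</dd>"
    "<dd>1 High Street, Preston PR1 2AB</dd>"
    "</dl>")
extract_table_data("Example PCT", fragment, "GP")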
def latlng(self):
    from scraperwiki import geo
    if self[u'Address of Proposal']:
        return geo.gb_postcode_to_latlng(
            geo.extract_gb_postcode(self[u'Address of Proposal']))
    else:
        return None
import mechanize
import scraperwiki
from scraperwiki.geo import gb_postcode_to_latlng

# br (a mechanize.Browser), url, postcode and office_type are module-level
# globals; SearchErrors and parse_page are defined elsewhere in the scraper.
def read_town(town):
    br.open(url)
    assert br.viewing_html()
    br.select_form(name="finderForm")
    br[postcode] = town
    print br[office_type]
    br[office_type] = ["12"]
    res2 = br.submit()
    assert br.viewing_html()
    page_num = 1
    # print res2.info()  # headers
    while True:
        page = res2.read()
        assert page
        if "The details you have entered did not find any matches." in page:
            print town, page_num, "no results"
            assert page_num == 1
            return
        print town, page_num
        if "bf-results" not in page:
            raise SearchErrors
        for po in parse_page(page):
            latlng = gb_postcode_to_latlng(po["postcode"])
            scraperwiki.sqlite.save(unique_keys=["name", "postcode"],
                                    data=po, latlng=latlng)
        page_num += 1
        try:
            res2 = br.follow_link(text_regex="^next")
        except mechanize.LinkNotFoundError:
            break
        assert br.viewing_html()
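# parse_page is defined elsewhere in the scraper. A hypothetical sketch of its
# shape, assuming each result block in the "bf-results" markup carries a name
# and a postcode -- the real selectors are not shown in this excerpt:
import lxml.html

def parse_page(page):
    root = lxml.html.fromstring(page)
    for div in root.cssselect("div.bf-results li"):  # assumed selector
        yield {
            "name": div.cssselect("h2")[0].text_content().strip(),       # assumed
            "postcode": div.cssselect("span.postcode")[0].text.strip(),  # assumed
        }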
from html5lib import HTMLParser, treebuilders
from scraperwiki import scrape
from scraperwiki import datastore, geo

def scrape_pct(link, pct_name):
    """
    Scrapes the data associated with the PCT, and calls functions to
    scrape data associated with the services.
    """
    print
    print
    print pct_name
    print "-" * len(pct_name)
    url = "http://www.nhs.uk" + link
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))
    root = page.getroot()
    d = {}
    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    address = root.find("body/div/form/div/div/p").text
    d["address"] = address
    postcode = geo.extract_gb_postcode(address)
    d["postcode"] = postcode
    d["latlng"] = geo.gb_postcode_to_latlng(postcode)
    d["info HTML"] = url
    # quality
    for t in root.findall("body/div/form/div/div/div/div/div/div/div"
                          "[@class='service-feedback clear']"):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v
    # head honcho
    for t in root.findall("body/div/form/div/div/div/div/div/div/div/p"
                          "[@class='profiles-picture-caption']"):
        d["Boss"] = t.text.replace("<br />", ", ")
    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class", False) == "intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = d.get("boilerplate", "") + "\n" + t.text
    datastore.save(unique_keys=["PCT", "type", "name", "address"],
                   data=d, latlng=d.get("latlng"))
    # scrape_facilities and scrape_others are defined elsewhere in the scraper
    scrape_facilities(pct_name, root)
    scrape_others(pct_name, url)
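# html5lib is presumably used here because it tolerates malformed markup while
# the "lxml" treebuilder still yields an lxml document, so the findall() calls
# above work unchanged. The same setup in isolation (the URL is just the site
# root, for illustration):
from html5lib import HTMLParser, treebuilders
from scraperwiki import scrape

parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
root = parser.parse(scrape("http://www.nhs.uk/")).getroot()
print root.tag  # 'html' (possibly namespace-qualified, depending on html5lib version)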
import mechanize
from time import sleep
from urllib2 import URLError
from scraperwiki import sqlite
from scraperwiki.geo import gb_postcode_to_latlng

# br, url and postcode are module-level globals; SearchErrors and parse_page
# are defined elsewhere in the scraper.
def read_town(town):
    br.open(url)
    assert br.viewing_html()
    br.select_form(nr=1)
    br[postcode] = town
    res2 = br.submit()
    assert br.viewing_html()
    page_num = 1
    # print res2.info()  # headers
    while True:
        page = res2.read()
        assert page
        if 'The details you have entered did not find any matches.' in page:
            print town, page_num, 'no results'
            assert page_num == 1
            return
        print town, page_num
        if 'bf-results' not in page:
            print 'search error'
            raise SearchErrors
        print "calling parse_page"
        for po in parse_page(page):
            print po
            # the postcode lookup is flaky, so retry it up to 10 times
            latlon = None
            for attempt in range(10):
                try:
                    latlon = gb_postcode_to_latlng(po['postcode'])
                    break
                except:
                    pass
                print 'gb_postcode_to_latlng fail for "%s", attempt %d' % (
                    po['postcode'], attempt)
                sleep(10)
            if latlon:
                (po['lat'], po['lon']) = latlon
            sqlite.save(unique_keys=['name', 'postcode'], data=po)
        page_num += 1
        # the "next" link sometimes fails to load; retry transient URLErrors
        link_not_found = False
        for attempt in range(5):
            try:
                res2 = br.follow_link(text_regex='^next')
                break
            except mechanize.LinkNotFoundError:
                link_not_found = True
                break
            except URLError:
                # try again
                if attempt == 4:
                    raise
                print 'retry, attempt:', attempt
        if link_not_found:
            break
        assert br.viewing_html()
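# The two inline retry loops above could share one helper. A sketch only, not
# part of the original scraper -- note the original deliberately re-raises
# URLError on the final attempt, which this simplified version does not:
from time import sleep

def with_retries(fn, attempts, delay):
    """Call fn() until it succeeds; return None if every attempt fails."""
    for attempt in range(attempts):
        try:
            return fn()
        except Exception:
            print 'retry, attempt:', attempt
            sleep(delay)
    return None

# e.g.  latlon = with_retries(lambda: gb_postcode_to_latlng(po['postcode']), 10, 10)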
import lxml.html
from scraperwiki import sqlite, geo

def scrape_pct(link, pct_name):
    """
    Scrapes the data associated with the PCT, and calls functions to
    scrape data associated with the services.
    """
    url = "http://www.nhs.uk" + link
    root = lxml.html.parse(url).getroot()
    d = {}
    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    print lxml.html.tostring(root)
    address = root.cssselect("div.panel-content div.pad p")[0].text
    d["address"] = address
    d["postcode"] = geo.extract_gb_postcode(address)
    try:
        d["lat"], d["lng"] = geo.gb_postcode_to_latlng(d["postcode"])
    except:
        print "Postcode not found", d["postcode"]
    d["info HTML"] = url
    colour = "green"
    # quality; the colour turns yellow when a "Fair" entry appears
    for t in root.findall("body/div/form/div/div/div/div/div/div/div"
                          "[@class='service-feedback clear']"):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v
        if k == "Fair":
            colour = "yellow"
    d["colour"] = colour
    # head honcho
    for t in root.findall("body/div/form/div/div/div/div/div/div/div/p"
                          "[@class='profiles-picture-caption']"):
        d["Boss"] = t.text.replace("<br />", ", ")
    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class", False) == "intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = (d.get("boilerplate", "") + "\n" + t.text).strip()
    sqlite.save(unique_keys=["PCT", "type", "name"], data=d)
    # scrape_facilities and scrape_others are defined elsewhere in the scraper
    scrape_facilities(pct_name, root)
    scrape_others(pct_name, url)
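# The deep positional findall() paths above break whenever a wrapper <div>
# changes; cssselect on the class names the code already matches is usually
# sturdier. A sketch of equivalent lookups (behaviour otherwise unchanged):
for t in root.cssselect("div.service-feedback.clear"):
    k = t.find("div/h4").text.strip()
    v = t.find("div/img").attrib["alt"]

for t in root.cssselect("p.profiles-picture-caption"):
    boss = t.text.replace("<br />", ", ")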
from scraperwiki.geo import extract_gb_postcode, gb_postcode_to_latlng

def tests():
    pc = extract_gb_postcode('10 Romford Road Preston Lancashire')
    print pc, gb_postcode_to_latlng(pc)
import csv
import re
import urllib2
import BeautifulSoup
from scraperwiki import datastore
from scraperwiki import geo

class Outcodes():
    def __init__(self):
        self.download_outcodes()

# ... tail of Scrape.parse(): drop empty fields, collect each pub ...
        for k, v in pub.items():
            if not v:
                del pub[k]
        results.append(pub)
        return results

scraper = Scrape()
for code in Outcodes():
    # restrict to Ipswich (IP) and Norwich (NR) outcodes
    if code['outcode'][:2] == "IP" or code['outcode'][:2] == "NR":
        scraper.scrape(code['outcode'])
results = scraper.parse()
for pub in results:
    datastore.save(['name', 'address-postcode'], pub,
                   latlng=geo.gb_postcode_to_latlng(pub['address-postcode']))
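# The driver loop reads code['outcode'] from each item, so Outcodes presumably
# implements __iter__ over the downloaded outcode list. A hypothetical method
# sketch only -- download_outcodes and the real CSV layout are not shown in
# this excerpt:
    def __iter__(self):
        for row in csv.DictReader(self.outcodes):  # assumed attribute holding the CSV file object
            yield row                              # each row expected to carry an 'outcode' key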
def latlng(self):
    from scraperwiki import geo
    # Note: unlike the guarded variant earlier in this section, this version
    # raises if 'Address of Proposal' is empty.
    return geo.gb_postcode_to_latlng(
        geo.extract_gb_postcode(self[u'Address of Proposal']))