def extract_table_data(pct_name,s,facility_type): """ Extracts data from a list of PCT facilities """ services = [] d = {} for t in s.getchildren(): if t.tag=="dt": if d != {}: services.append(d) d = {"PCT":pct_name, "type":"service"} u = t.find("a") if u != None: t = u d["info HTML"] = "http://www.nhs.uk" + t.attrib["href"] name = (t.text or "").strip() d["name"] = name print name elif t.text[:4]=="tel:": d["telephone"]=t.text[5:] else: address = t.text d["address"] = address postcode = geo.extract_gb_postcode(address) d["postcode"] = postcode d["latlng"] = geo.gb_postcode_to_latlng(postcode) for d in services: if "info HTML" in d: scrape_extra(d,facility_type) datastore.save(unique_keys=["PCT","type","name","address"], data=d)
def latlng(self):
    """Return (lat, lng) for the proposal's address, or None when the
    'Address of Proposal' field is empty."""
    from scraperwiki import geo
    address = self[u'Address of Proposal']
    if not address:
        return None
    postcode = geo.extract_gb_postcode(address)
    return geo.gb_postcode_to_latlng(postcode)
def extract_table_data(pct_name, s, facility_type): """ Extracts data from a list of PCT facilities """ services = [] d = {} for t in s.getchildren(): if t.tag == "dt": if d != {}: services.append(d) d = {"PCT": pct_name, "type": "service"} u = t.find("a") if u != None: t = u d["info HTML"] = "http://www.nhs.uk" + t.attrib["href"] name = (t.text or "").strip() d["name"] = name print name elif t.text[:4] == "tel:": d["telephone"] = t.text[5:] else: address = t.text d["address"] = address postcode = geo.extract_gb_postcode(address) d["postcode"] = postcode d["latlng"] = geo.gb_postcode_to_latlng(postcode) for d in services: if "info HTML" in d: scrape_extra(d, facility_type) datastore.save(unique_keys=["PCT", "type", "name", "address"], data=d)
def scrape_pct(link, pct_name): """ Scrapes the data associated with the PCT, and calls functions to scrape data associated with the services. """ print print print pct_name print "-" * len(pct_name) url = "http://www.nhs.uk" + link parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml")) page = parser.parse(scrape(url)) root = page.getroot() d = {} # basic contact details d["PCT"] = pct_name d["type"] = "main" d["name"] = pct_name address = root.find("body/div/form/div/div/p").text d["address"] = address postcode = geo.extract_gb_postcode(address) d["postcode"] = postcode d["latlng"] = geo.gb_postcode_to_latlng(postcode) d["info HTML"] = url # quality for t in root.findall( "body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']" ): k = t.find("div/h4").text.strip() v = t.find("div/img").attrib["alt"] d[k] = v # head honcho for t in root.findall( "body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']" ): d["Boss"] = t.text.replace("<br />", ", ") # boring text for t in root.findall("body/div/form/div/div/div/div/div/div/p"): if t.text: if t.attrib.get("class", False) == "intro": d["intro text"] = t.text else: d["boilerplate"] = d.get("boilerplate", "") + "\n" + t.text datastore.save(unique_keys=["PCT", "type", "name", "address"], data=d, latlng=d.get("latlng")) scrape_facilities(pct_name, root) scrape_others(pct_name, url)
def scrape_pct(link,pct_name):
    """ Scrapes the data associated with the PCT, and calls functions to scrape data associated with the services. """
    # Banner so each PCT is easy to spot in the scraper log.
    print
    print
    print pct_name
    print "-"*len(pct_name)
    url = "http://www.nhs.uk" + link
    # Parse the fetched profile page into an lxml tree via html5lib.
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))
    root = page.getroot()
    d = {}
    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    address = root.find("body/div/form/div/div/p").text
    d["address"] = address
    postcode = geo.extract_gb_postcode(address)
    d["postcode"] = postcode
    d["latlng"] = geo.gb_postcode_to_latlng(postcode)
    d["info HTML"] = url
    # quality: one rating block per feedback category
    for t in root.findall("body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"):
        k = t.find("div/h4").text.strip()
        # The rating value lives in the star image's alt text.
        v = t.find("div/img").attrib["alt"]
        d[k] = v
    # head honcho
    for t in root.findall("body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"):
        d["Boss"] = t.text.replace("<br />",", ")
    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class",False)=="intro":
                d["intro text"] = t.text
            else:
                # Accumulate every remaining paragraph, newline-separated.
                d["boilerplate"] = d.get("boilerplate","")+"\n"+t.text
    datastore.save(unique_keys=["PCT","type","name","address"], data=d, latlng=d.get("latlng"))
    scrape_facilities(pct_name,root)
    scrape_others(pct_name,url)
def scrape_pct(link,pct_name): """ Scrapes the data associated with the PCT, and calls functions to scrape data associated with the services. """ url = "http://www.nhs.uk" + link root = lxml.html.parse(url).getroot() d = {} # basic contact details d["PCT"] = pct_name d["type"] = "main" d["name"] = pct_name print lxml.html.tostring(root) address = root.cssselect("div.panel-content div.pad p")[0].text d["address"] = address d["postcode"]= geo.extract_gb_postcode(address) try: d["lat"], d["lng"] = geo.gb_postcode_to_latlng(d["postcode"]) except: print "Postcode not found", d["postcode"] d["info HTML"] = url colour = "green" # quality for t in root.findall("body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"): k = t.find("div/h4").text.strip() v = t.find("div/img").attrib["alt"] d[k] = v if k == "Fair": colour = "yellow" d["colour"] = colour # head honcho for t in root.findall("body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"): d["Boss"] = t.text.replace("<br />",", ") # boring text for t in root.findall("body/div/form/div/div/div/div/div/div/p"): if t.text: if t.attrib.get("class",False)=="intro": d["intro text"] = t.text else: d["boilerplate"] = (d.get("boilerplate","")+"\n"+t.text).strip() sqlite.save(unique_keys=["PCT","type","name"], data=d) scrape_facilities(pct_name,root) scrape_others(pct_name,url)
def entries_from_doc(doc):
    """Yield one dict per data row of the page's results table.

    Pairs each row's cells with COLUMN_NAMES, parses valid_date (left as
    the raw string when unparseable), resolves the row's detail link and
    pulls a GB postcode out of the site_location column when possible.
    """
    for row in doc.xpath('//div[@class="tablecontainer"]//tr'):
        columns = [col.text_content().strip() for col in row.xpath('./td')]
        if not columns:
            # header / spacer rows have no <td> cells
            continue
        entry = {name: value for name, value in zip(COLUMN_NAMES, columns)}
        try:
            entry['valid_date'] = datetime.strptime(entry['valid_date'],
                                                    DATE_FORMAT)
        except (KeyError, ValueError):
            # KeyError: zip truncates, so a short row may lack the
            # column entirely; ValueError: unparseable date — keep raw.
            pass
        hrefs = row.xpath('.//a/@href')
        if hrefs:
            # Guard against link-less rows instead of IndexError-ing.
            entry['details_url'] = urljoin(URL, hrefs[0])
        if 'site_location' in entry:
            postcode = extract_gb_postcode(entry['site_location'])
            if postcode:
                entry['postcode'] = postcode
        # NOTE: latlng lookup deliberately disabled:
        #latlng = gb_postcode_to_latlng(postcode)
        #if latlng:
        #    entry['lat'], entry['lng'] = latlng
        yield entry
def tests(): pc = extract_gb_postcode('10 Romford Road Preston Lancashire') print pc, gb_postcode_to_latlng(pc)
def latlng(self):
    """Return (lat, lng) for the proposal's address, or None when the
    'Address of Proposal' field is empty or missing a value."""
    from scraperwiki import geo
    address = self[u'Address of Proposal']
    # Guard against empty addresses instead of passing a falsy value to
    # the geocoder — matches the guarded latlng implementation used
    # elsewhere in this codebase.
    if not address:
        return None
    return geo.gb_postcode_to_latlng(geo.extract_gb_postcode(address))