def scrape(workspace): print "Scraping A&E Waiting Times with workspace {}".format(workspace) html = requests.get(ROOT) page = fromstring(html.content) h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Weekly Data and Quarterly Aggregates']) links = h3.getnext().cssselect('a') h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Monthly Data']) links += h3.getnext().cssselect('a') for l in links: print l datasets = [] for l in links: try: datasets.extend(scrape_page(l.get("href"))) except: import traceback traceback.print_exc() datasets = filter(lambda x: x is not None, datasets) print "Processed {} datasets".format(len(datasets)) return datasets
def scrape(workspace): print "Scraping Delayed Transfer {}".format(workspace) global DEFAULT_NOTES html = requests.get(ROOT) page = fromstring(html.content) default_notes(page) h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Data']) links = h3.getnext().cssselect('a') datasets = [] datasets.extend(scrape_page(links[-1].get("href"))) for l in links: datasets.extend(scrape_page(l.get("href"))) # Get the annual statistical reports h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Annual Statistical Report']) links = h3.getnext().cssselect('a') dataset = { "resources": [anchor_to_resource(l) for l in links], "title": "Delayed Transfers of Care - Annual Statistical Reports", "origin": ROOT, "notes": DEFAULT_NOTES, "frequency": "Annually", "groups": ['delayed_transfer'] } dataset["name"] = slugify.slugify(dataset["title"]).lower() datasets.append(dataset) datasets = filter(lambda x: x is not None, datasets) print "Processed {} datasets".format(len(datasets)) return datasets
def history(page):
    link = hd(hd(page.cssselect('#ctlNavigationPastResults')).cssselect('a'))
    page = get_dom(urljoin(ROOT, link.get('href')))

    div = hd(page.cssselect('.foldout-set'))
    links = [a for a in div.cssselect('a')
             if 'Detailed Spreadsheets' in a.text_content().strip()]
    for link in links:
        u = urljoin(ROOT, link.get('href'))
        page = get_dom(u)
        yield scrape_page(page, u)
def latest(page):
    # Find the Latest Data link at "http://www.nhsstaffsurveys.com/" and scrape
    # that page.
    link = hd(hd(page.cssselect('#ctlNavigationLatestResults')).cssselect('a'))
    page = get_dom(urljoin(ROOT, link.get('href')))

    h3 = hd([h for h in page.cssselect('h3')
             if h.text_content().strip() == "Detailed spreadsheets"])
    latest_link = hd(h3.getparent().getnext().cssselect('a'))
    u = urljoin(ROOT, latest_link.get('href'))
    page = get_dom(u)
    return scrape_page(page, u)
def scrape(workspace): print "Scraping CWT with workspace {}".format(workspace) datasets = [] bases = [ 'http://www.england.nhs.uk/statistics/statistical-work-areas/cancer-waiting-times/provider-based-cancer-waiting-times-statistics/', 'http://www.england.nhs.uk/statistics/statistical-work-areas/cancer-waiting-times/commissioner-based-cancer-waiting-times-statistics/' ] targets = [] for base in bases: html = requests.get(base) page = fromstring(html.content) h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip().lower() == 'latest statistics']) links = [a.get('href') for a in h3.getnext().cssselect('a')] for l in links: print l targets += links for t in targets: datasets.append(scrape_commissioner_page(t)) # datasets.extend(commissioner_based()) # datasets.extend(default_cwt()) datasets = filter(lambda x: x is not None, datasets) return datasets
def get_groups(self):
    if self.groups:
        return self.groups

    if 'Improving Access to Psychological Therapies' in self.dataset['title']:
        self.groups.append('IAPT')
    if 'Hospital Episode Statistics' in self.dataset['title']:
        self.groups.append('HES')
    if 'SHMI' in self.dataset['title']:
        self.groups.append('SHMI')

    # Check indicator specific data....
    firsturl = hd([s['url'] for s in self.dataset.get('sources', [])
                   if s['description'] == 'Indicator specification'])
    if firsturl:
        if 'Clinical Commissioning Group Indicators' in firsturl:
            self.groups.append('CCGOIS')
        if 'Outcomes Framework' in firsturl:
            self.groups.append('NHSOF')

    for a in self.dataset.get('sources', []):
        if 'Quality and Outcomes Framework' in a['description']:
            self.groups.append('QOF')

    self.groups = list(set(self.groups))
    if self.groups:
        print "***" * 20
        print "Curated into a group {}".format(self.groups)
        print "***" * 20
    return self.groups
def scrape(workspace): print "Scraping Child Immunisation with workspace {}".format(workspace) html = requests.get(ROOT).content page = fromstring(html) div = page.cssselect('.center')[0] links = div.cssselect('a')[3:] h3 = hd([ h for h in div.cssselect('h3') if h.text_content().strip() == "Background" ]) desc = h3.getnext().text_content() dataset = { "title": "Child Immunisation", "notes": to_markdown(fix_bad_unicode(unicode(desc))), "coverage_start_date": "", "coverage_end_date": "", "resources": [], "frequency": "Quarterly", "origin": ROOT, "tags": ["immunisation", "children"], "groups": ['child_immunisation'] } dataset["name"] = slugify.slugify(dataset["title"]).lower() earliest_quarter, earliest_year = 4, 9999 latest_quarter, latest_year = 1, 2000 for l in links: y, q = get_quarter_and_year(l.text_content().strip()) if y < earliest_year: earliest_year = y if q < earliest_quarter: earliest_quarter = q if y > latest_year: latest_year = y if latest_quarter > q: latest_quarter = q dataset["resources"].append(anchor_to_resource(l)) if earliest_quarter == 4: earliest_year += 1 if latest_quarter == 4: latest_year += 1 s, e = QUARTERS[earliest_quarter] dataset["coverage_start_date"] = "{}-{}-01".format(earliest_year, str(s).zfill(2)) s, e = QUARTERS[latest_quarter] _, last_day = calendar.monthrange(latest_year, s - 1) dataset["coverage_end_date"] = "{}-{}-{}".format(earliest_year, str(s - 1).zfill(2), last_day) return [dataset]
def scrape_page(page, url):
    dataset = {
        "title": "NHS {}".format(hd(page.cssselect('h1')).text_content().strip()),
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    dataset["origin"] = url
    dataset["tags"] = ["staff survey"]

    # The survey year appears in the page title, e.g. "NHS Staff Survey 2013".
    year = re.match(r'.*(\d{4}).*', dataset['title']).groups()[0]
    dataset["coverage_start_date"] = "{}-01-01".format(year)
    dataset["coverage_end_date"] = "{}-12-31".format(year)

    desc_node = page.cssselect('div.column-content p')
    if desc_node:
        dataset["notes"] = hd(desc_node).text_content()
    else:
        dataset["notes"] = "Results for the Staff Survey {year} can be seen below. "\
            "We have released detailed spreadsheets showing key findings "\
            "and question level information for each trust who took part "\
            "in the {year} survey.".format(year=year)
    dataset['notes'] = to_markdown(dataset['notes'])

    dataset["resources"] = []
    boxes = page.cssselect('.document-box')
    for box in boxes:
        a = box.cssselect('a')[0]
        resource = anchor_to_resource(a)
        resource['description'] = box.cssselect('h4')[0].text_content().strip()
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)

    # The key findings report is linked separately via a "Click here" anchor.
    key = hd([a for a in page.cssselect('a')
              if a.text_content().strip() == 'Click here'])
    if key is not None:
        resource = anchor_to_resource(key)
        resource['description'] = "Key Findings"
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)
    return dataset
def scrape(workspace): print "Scraping Child Immunisation with workspace {}".format(workspace) html = requests.get(ROOT).content page = fromstring(html) div = page.cssselect('.center')[0] links = div.cssselect('a')[3:] h3 = hd([h for h in div.cssselect('h3') if h.text_content().strip() == "Background"]) desc = h3.getnext().text_content() dataset = { "title": "Child Immunisation", "notes": to_markdown(fix_bad_unicode(unicode(desc))), "coverage_start_date": "", "coverage_end_date": "", "resources": [], "frequency": "Quarterly", "origin": ROOT, "tags": ["immunisation", "children"], "groups": ['child_immunisation'] } dataset["name"] = slugify.slugify(dataset["title"]).lower() earliest_quarter, earliest_year = 4, 9999 latest_quarter, latest_year = 1, 2000 for l in links: y, q = get_quarter_and_year(l.text_content().strip()) if y < earliest_year: earliest_year = y if q < earliest_quarter: earliest_quarter = q if y > latest_year: latest_year = y if latest_quarter > q: latest_quarter = q dataset["resources"].append(anchor_to_resource(l)) if earliest_quarter == 4: earliest_year += 1 if latest_quarter == 4: latest_year += 1 s, e = QUARTERS[earliest_quarter] dataset["coverage_start_date"] = "{}-{}-01".format(earliest_year, str(s).zfill(2)) s, e = QUARTERS[latest_quarter] _, last_day = calendar.monthrange(latest_year, s-1) dataset["coverage_end_date"] = "{}-{}-{}".format(earliest_year, str(s-1).zfill(2), last_day) return [dataset]
def scrape_page(page, url): dataset = { "title": "NHS {}".format(hd(page.cssselect('h1')).text_content().strip()), } dataset["name"] = slugify.slugify(dataset["title"]).lower() dataset["origin"] = url dataset["tags"] = ["staff survey"] year = re.match('.*(\d{4}).*', dataset['title']).groups()[0] dataset["coverage_start_date"] = "{}-01-01".format(year) dataset["coverage_end_date"] = "{}-12-31".format(year) desc_node = page.cssselect('div.column-content p') if desc_node: dataset["notes"] = hd(desc_node).text_content() else: dataset["notes"] = "Results for the Staff Survey {year} can be seen below. "\ "We have released detailed spreadsheets showing key finding "\ "and question level information for each trust who took part "\ "in the {year} survey.".format(year=year) dataset['notes'] = to_markdown(dataset['notes']) dataset["resources"] = [] boxes = page.cssselect('.document-box') for box in boxes: a = box.cssselect('a')[0] resource = anchor_to_resource(a) resource['description'] = box.cssselect('h4')[0].text_content().strip() resource['url'] = urljoin(ROOT, resource['url']) dataset["resources"].append(resource) key = hd([a for a in page.cssselect('a') if a.text_content().strip() == 'Click here']) if key is not None: resource = anchor_to_resource(key) resource['description'] = "Key Findings" resource['url'] = urljoin(ROOT, resource['url']) dataset["resources"].append(resource) return dataset
def scrape(workspace): print "Scraping Delayed Transfer {}".format(workspace) global DEFAULT_NOTES html = requests.get(ROOT) page = fromstring(html.content) default_notes(page) h3 = hd([ h for h in page.cssselect('h3') if h.text_content().strip() == 'Data' ]) links = h3.getnext().cssselect('a') datasets = [] datasets.extend(scrape_page(links[-1].get("href"))) for l in links: datasets.extend(scrape_page(l.get("href"))) # Get the annual statistical reports h3 = hd([ h for h in page.cssselect('h3') if h.text_content().strip() == 'Annual Statistical Report' ]) links = h3.getnext().cssselect('a') dataset = { "resources": [anchor_to_resource(l) for l in links], "title": "Delayed Transfers of Care - Annual Statistical Reports", "origin": ROOT, "notes": DEFAULT_NOTES, "frequency": "Annually", "groups": ['delayed_transfer'] } dataset["name"] = slugify.slugify(dataset["title"]).lower() datasets.append(dataset) datasets = filter(lambda x: x is not None, datasets) print "Processed {} datasets".format(len(datasets)) return datasets
def scrape(workspace): print "Scraping Diagnostic Imaging Data with workspace {}".format(workspace) datasets = [] page = get_dom(ROOT) datasets.append(scrape_page(ROOT, title="Diagnostic Imaging Dataset - Previous versions")) h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == "Data"]) for a in h3.getnext().cssselect('a'): datasets.append(scrape_page(a.get('href'))) datasets = filter(lambda x: x is not None, datasets) return datasets
def scrape(workspace): print "Scraping Critical Care Capacity {}".format(workspace) html = requests.get(ROOT) page = fromstring(html.content) default_notes(page) h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Latest Data']) links = h3.getnext().cssselect('a') datasets = [] datasets.extend(scrape_page(links[-1].get("href"))) for l in links: datasets.extend(scrape_page(l.get("href"))) datasets = filter(lambda x: x is not None, datasets) print "Processed {} datasets".format(len(datasets)) return datasets
def scrape(workspace): print "Scraping Diagnostic Imaging Data with workspace {}".format( workspace) datasets = [] page = get_dom(ROOT) datasets.append( scrape_page(ROOT, title="Diagnostic Imaging Dataset - Previous versions")) h3 = hd([ h for h in page.cssselect('h3') if h.text_content().strip() == "Data" ]) for a in h3.getnext().cssselect('a'): datasets.append(scrape_page(a.get('href'))) datasets = filter(lambda x: x is not None, datasets) return datasets
def default_notes(page): """ Some pages don't have a description. If we have no DEFAULT_NOTES then see if we can find them on the current page for the use in later pages """ global DEFAULT_NOTES if DEFAULT_NOTES: return print "Getting default notes" p = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Background']) if p is None: return desc = [] while True: p = p.getnext() if p.tag not in ['p', 'ul']: break s = tostring(p) s = s.replace('&', '&') desc.append(s) DEFAULT_NOTES = to_markdown("".join(desc))
def default_notes(page): """ Some pages don't have a description. If we have no DEFAULT_NOTES then see if we can find them on the current page for the use in later pages """ global DEFAULT_NOTES if DEFAULT_NOTES: return print "Getting default notes" p = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Background']) if p is None: return desc = [] while True: p = p.getnext() if p.tag != 'p': break s = p.text_content().strip() s = s.replace('&', '&') desc.append(s) DEFAULT_NOTES = to_markdown("".join(desc))
def scrape(workspace): print "Scraping Critical Care Capacity {}".format(workspace) html = requests.get(ROOT) page = fromstring(html.content) default_notes(page) h3 = hd([ h for h in page.cssselect('h3') if h.text_content().strip() == 'Latest Data' ]) links = h3.getnext().cssselect('a') datasets = [] datasets.extend(scrape_page(links[-1].get("href"))) for l in links: datasets.extend(scrape_page(l.get("href"))) datasets = filter(lambda x: x is not None, datasets) print "Processed {} datasets".format(len(datasets)) return datasets
def get_groups(self): if self.groups: return self.groups if 'Improving Access to Psychological Therapies' in self.dataset[ 'title']: self.groups.append('IAPT') if 'Hospital Episode Statistics' in self.dataset['title']: self.groups.append('HES') if 'SHMI' in self.dataset['title']: self.groups.append('SHMI') # Check indicator specific data.... firsturl = hd([ s['url'] for s in self.dataset.get('sources', []) if s['description'] == 'Indicator specification' ]) if firsturl: if 'Clinical Commissioning Group Indicators' in firsturl: self.groups.append('CCGOIS') if 'Outcomes Framework' in firsturl: self.groups.append('NHSOF') for a in self.dataset.get('sources', []): if 'Quality and Outcomes Framework' in a['description']: self.groups.append('QOF') self.groups = list(set(self.groups)) if self.groups: print "***" * 20 print "Curated into a group {}".format(self.groups) print "***" * 20 return self.groups
def get_page_count():
    dom = get_dom(ROOT.format(1))
    return int(hd(dom.cssselect('#paging li a.last')).text_content())
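# get_page_count implies ROOT here is a URL template with a page-number
# slot (e.g. "...?page={}"). A typical caller would walk every page like
# this (illustrative only; iter_pages is not a repository function):

def iter_pages():
    for page_number in range(1, get_page_count() + 1):
        yield get_dom(ROOT.format(page_number))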
def process_monthly(page):
    datasets = []
    title = "Monthly Hospital Activity Data"
    description = "Monthly activity data relating to elective and non-elective inpatient "\
        "admissions (FFCEs) and outpatient referrals and attendances for first "\
        "consultant outpatient appointments."

    # Each year heading ("2013-14" etc.) introduces a block of provider and
    # commissioner links.
    headers = page.cssselect('h3,h4')
    for h in headers:
        text = h.text_content().strip()
        if re.match(r"(\d{4})-(\d{2})", text):
            datasets.extend(process_block(h,
                                          _p_strong("Provider based"),
                                          _p_strong("Commissioner based"),
                                          title, description, QUARTERLY))

    # Walk the "Previous ..." section to pick up the historical links.
    provider_links, commissioner_links = [], []
    h3prev = hd([h for h in page.cssselect('h3')
                 if h.text_content().strip().startswith("Previous")])
    p = h3prev.getnext()
    while True:
        if p is None or len(p) == 0:
            break
        if _p_strong("Provider based")(p):
            provider_links = p.getnext().cssselect('a')
        if _p_strong("Commissioner based")(p):
            commissioner_links = p.getnext().cssselect('a')
        p = p.getnext()

    for l in provider_links:
        m = re.match(r".*(\d{4})-\d{2}.*", l.text_content().encode('utf8'))
        yr = int(m.groups()[0])
        csd = "{}-04-01".format(yr)
        ced = "{}-03-31".format(yr + 1)
        pdataset = {
            "title": "{} - Provider based - {}-{}".format(title, yr, yr + 1),
            "notes": description,
            "origin": MONTHLY,
            "resources": [anchor_to_resource(l)],
            "frequency": "Annual",
            "coverage_start_date": csd,
            "coverage_end_date": ced,
            "groups": ['hospital_activity']
        }
        pdataset["name"] = slugify.slugify(pdataset["title"]).lower()
        datasets.append(pdataset)

    for l in commissioner_links:
        m = re.match(r".*(\d{4})-\d{2}.*", l.text_content().encode('utf8'))
        yr = int(m.groups()[0])
        csd = "{}-04-01".format(yr)
        ced = "{}-03-31".format(yr + 1)
        cdataset = {
            "title": "{} - Commissioner based - {}-{}".format(title, yr, yr + 1),
            "notes": description,
            "origin": MONTHLY,
            "resources": [anchor_to_resource(l)],
            "frequency": "Annual",
            "coverage_start_date": csd,
            "coverage_end_date": ced,
            "groups": ['hospital_activity']
        }
        cdataset["name"] = slugify.slugify(cdataset["title"]).lower()
        datasets.append(cdataset)

    time_series_links = [a for a in page.cssselect('a')
                         if 'Timeseries' in (a.get('href') or '')]
    dataset = {
        "title": "{} - Time Series".format(title),
        "notes": description,
        "origin": MONTHLY,
        "resources": [anchor_to_resource(a) for a in time_series_links]
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    datasets.append(dataset)
    return datasets
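# process_monthly tests paragraphs with _p_strong("..."), which is assumed
# to be a predicate factory matching a <p> whose <strong> child carries the
# given label. A sketch of that factory, inferred from the call sites:

def _p_strong(label):
    def predicate(element):
        if element.tag != 'p':
            return False
        strong = element.cssselect('strong')
        return bool(strong) and label in strong[0].text_content()
    return predicate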