def process_block(block, provider_fn, commissioner_fn, title, desc, origin):
    # Keep cycling through siblings until provider_fn matches an element,
    # then take getnext().cssselect('a'); likewise for commissioner_fn.
    p = block.getnext()

    provider_links = []
    commissioner_links = []

    print block.text_content().strip()
    while p is not None:
        if provider_fn(p):
            provider_links = p.getnext().cssselect('a')
        if commissioner_fn(p):
            commissioner_links = p.getnext().cssselect('a')
        if p.tag != 'p':
            break

        p = p.getnext()

    csd, ced = "", ""
    m = YEAR_MATCHER.match(block.text_content().strip())
    if m:
        syear = int(m.groups()[0])
        eyear = syear + 1
        csd = "{}-04-01".format(syear)
        ced = "{}-03-31".format(eyear)

    pdataset = {
        "title": "{} - Provider based - {}".format(title, block.text_content().strip()),
        "resources": [anchor_to_resource(a) for a in provider_links],
        "origin": origin,
        "notes": desc,
        "coverage_start_date": csd,
        "coverage_end_date": ced,
        "groups": ['hospital_activity']
    }
    pdataset["name"] = slugify.slugify(pdataset["title"]).lower()

    cdataset = {
        "title": "{} - Commissioner based - {}".format(title, block.text_content().strip()),
        "resources": [anchor_to_resource(a) for a in commissioner_links],
        "origin": origin,
        "notes": desc,
        "coverage_start_date": csd,
        "coverage_end_date": ced,
        "groups": ['hospital_activity']
    }
    cdataset["name"] = slugify.slugify(cdataset["title"]).lower()

    return [pdataset, cdataset]
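YEAR_MATCHER is not defined in this example; process_block only assumes that group 0 of the match captures the four-digit start year of a "2013-14" style heading. A minimal sketch under that assumption:

import re

# Hypothetical definition: group 0 is the start year of a fiscal-year
# heading such as "2013-14".
YEAR_MATCHER = re.compile(r'.*?(\d{4})-\d{2}')

m = YEAR_MATCHER.match('Provider data 2013-14')
assert m.groups()[0] == '2013'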
Example #2
def add_year_block(header, url):
    m = re.match("(.*)(\d{4})", header.text_content().strip())
    h3 = header

    if h3.getnext() is None:
        # Sometimes the header is hidden in a div. Sigh.
        h3 = h3.getparent()

    links = []
    while h3 is not None:
        h3 = h3.getnext()
        if h3 is None or h3.tag != "p":
            break
        links.extend(h3.cssselect('a'))

    year = m.groups()[1]
    import string
    month = filter(lambda x: x in string.printable, m.groups()[0].strip())

    dataset = {
        "title": u"Critical Care Bed Capacity and Urgent Operations Cancelled - {} {}".format(month, year),
        "resources": [anchor_to_resource(l) for l in links],
        "notes": DEFAULT_NOTES,
        "origin": url,
        "frequency": "Monthly",
        "groups": ['ccc']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    mnth = list(calendar.month_name).index(month)
    _, e = calendar.monthrange(int(m.groups()[1]), mnth)
    # Zero-pad the month so the coverage dates are valid ISO dates.
    dataset['coverage_start_date'] = "{}-{:02d}-01".format(m.groups()[1].strip(), mnth)
    dataset['coverage_end_date'] = "{}-{:02d}-{}".format(m.groups()[1].strip(), mnth, e)

    return dataset
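Nearly every scraper here calls anchor_to_resource without defining it. Judging by the keys the examples read back ('url', 'description', 'name', 'format') and the post_create_func hook used later, a plausible sketch is:

def anchor_to_resource(anchor, post_create_func=None):
    # Hypothetical reimplementation: build a resource dict from an lxml
    # anchor element, deriving the format from the file extension.
    url = anchor.get('href')
    resource = {
        'url': url,
        'description': anchor.text_content().strip(),
        'name': url.split('/')[-1],
        'format': url.split('.')[-1].upper(),
    }
    if post_create_func is not None:
        post_create_func(resource)
    return resource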
Example #3
def scrape_page(url):
    dom = get_dom(url)

    description = to_markdown(''.join([tostring(d) for d in dom.cssselect('.summary')]))

    resources = []
    for a in dom.cssselect('.notevalue a'):
        href = a.get('href')
        if 'searchcatalogue' in href or '.exe' in href:
            continue
        if not "datagov.ic.nhs.uk" in href:
            continue
        resources.append(anchor_to_resource(a))

    dataset = {
        "title": dom.cssselect('#headingtext')[0].text_content().strip(),
        "notes": description,
        "resources": resources,
        "tags": ['prescibing'],
        "frequency": "Monthly",
        "origin": url
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()[:99]

    sdate, edate = get_date_range(dom)
    dataset["coverage_start_date"] = sdate or ""
    dataset["coverage_end_date"] = edate or ""

    return dataset
def process_para(para, notes):
    title = para.cssselect('strong')[0].text_content()
    if 'CSV Format' in title:  # We'll take the XLS version for now.
        return None

    dataset = {}

    part = title.encode('utf8')[0:7].replace('/', '-')
    s, e = "", ""
    if part == 'England':
        part = 'England Time Series'
    else:
        s, e = date_range_for_year(int(part[0:4]))
    dataset['coverage_start_date'] = s
    dataset['coverage_end_date'] = e

    dataset["title"] = "Mental Health Community Teams Activity - {}".format(
        part)
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    dataset["origin"] = "http://www.england.nhs.uk/statistics/statistical-work-areas/mental-health-community-teams-activity/"
    dataset["notes"] = notes
    dataset['groups'] = ['mhc']
    links = para.cssselect('a')
    dataset['resources'] = [anchor_to_resource(l) for l in links]

    return dataset
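date_range_for_year is assumed rather than shown. Given that the other scrapers build April-to-March fiscal years, a one-line sketch consistent with that convention:

def date_range_for_year(year):
    # Hypothetical helper: NHS fiscal year running 1 April to 31 March.
    return "{}-04-01".format(year), "{}-03-31".format(year + 1)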
def scrape(workspace):
    print "Scraping Maternity and Breastfeeding with workspace {}".format(workspace)
    datasets = []

    page = fromstring(requests.get(ROOT).content)

    links = [a for a in page.cssselect('.center a') if 'upload' in a.get('href')]
    print len(links)

    dataset = {
        "title": "Maternity and Breastfeeding Data",
        "tags": ["maternity", "breastfeeding"],
        "resources": [anchor_to_resource(a) for a in links],
        "notes": "This collection reports on the number and proportion "\
                 "of women seen and assessed by a healthcare professional "\
                 "within 12 weeks and 6 days of their maternity, the number "\
                 "and proportion of mothers' who have initiated or not "\
                 "initiated breastfeeding and the number and proportion of "\
                 "infants who have been fully, partially or not at all breastfed "\
                 "at 6-8 weeks",
        "origin": ROOT,
        "groups": ['maternity_breastfeeding']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    print dataset
    return [dataset]
Example #6
def add_year_block(header, url):
    m = re.match("(.*)(\d{4})", header.text_content().strip())
    h3 = header

    if h3.getnext() is None:
        # Sometimes the header is hidden in a div. Sigh.
        h3 = h3.getparent()

    links = []
    while h3 is not None:
        h3 = h3.getnext()
        if h3 is None or h3.tag != "p":
            break
        links.extend(h3.cssselect('a'))

    from publish.lib.encoding import fix_bad_unicode
    txt = fix_bad_unicode(unicode(header.text_content().strip()))

    dataset = {
        "title": u"A&E Attendances and Emergency Admissions - {}".format(txt),
        "resources": [anchor_to_resource(l) for l in links],
        "notes": DEFAULT_NOTES,
        "origin": url,
        "frequency": "Weekly",
        "groups": ['a_and_e']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    mname = m.groups()[0].strip().encode('ascii', 'ignore')
    mnth = list(calendar.month_name).index(mname)
    _, e = calendar.monthrange(int(m.groups()[1]), mnth)
    # Zero-pad the month so the coverage dates are valid ISO dates.
    dataset['coverage_start_date'] = "{}-{:02d}-01".format(m.groups()[1].strip(), mnth)
    dataset['coverage_end_date'] = "{}-{:02d}-{}".format(m.groups()[1].strip(), mnth, e)

    return dataset
Example #7
def scrape_page(url, title=None):
    global FULL_DESC
    page = get_dom(url)

    if FULL_DESC is None:
        FULL_DESC = get_description(page)

    links = [
        a for a in page.cssselect('.center a') if 'upload' in a.get('href')
    ]
    dataset = {
        "title": title or page.cssselect('h1')[1].text_content().strip(),
        "notes": FULL_DESC,
        "origin": url,
        "tags": ["diagnostic imaging"],
        "resources": [anchor_to_resource(l) for l in links],
        "groups": ['did']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    syear, eyear = year_range_from_title(dataset["title"])
    if syear and eyear:
        dataset["coverage_start_date"] = "{}-04-01".format(syear)
        dataset["coverage_end_date"] = "{}-03-31".format(eyear)
    return dataset
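year_range_from_title is another assumed helper; the caller expects a (start, end) pair of full years, or falsy values when the title carries no range. A sketch for "2013-14" style titles:

import re

def year_range_from_title(title):
    # Hypothetical helper: extract "2013-14" or "2013/14" and expand the
    # short end year; (None, None) when no range is present.
    m = re.search(r'(\d{4})[-/](\d{2})', title)
    if not m:
        return None, None
    start = int(m.group(1))
    return start, int(str(start)[:2] + m.group(2))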
Example #8
def scrape_page(url):
    """ Scrapes a single page to create a dataset """

    print "Scraping ", url
    page = get_dom(url)
    header = page.cssselect('h1')[1]

    title = header.text_content().strip().replace('/', '-')
    description = get_description(header)

    links = [a for a in page.cssselect('.center a') if 'upload' in a.get('href')]
    resources = [anchor_to_resource(l) for l in links]

    start_year, end_year = year_range_from_title(title)

    dataset = {
        "title": title,
        "notes": description,
        "resources": resources,
        "origin": url,
        "coverage_start_date": "{}-04-01".format(start_year),
        "coverage_end_date": "{}-03-31".format(end_year),
        "tags": ["VTE"],
        "groups": ["vte"]
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    print dataset["name"], " has ", len(dataset["resources"]), " resources"
    return dataset
def scrape(workspace):
    print "Scraping Delayed Transfer {}".format(workspace)
    global DEFAULT_NOTES

    html = requests.get(ROOT)
    page = fromstring(html.content)
    default_notes(page)

    h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Data'])
    links = h3.getnext().cssselect('a')

    datasets = []
    datasets.extend(scrape_page(links[-1].get("href")))
    for l in links:
        datasets.extend(scrape_page(l.get("href")))

    # Get the annual statistical reports
    h3 = hd([h for h in page.cssselect('h3') if h.text_content().strip() == 'Annual Statistical Report'])
    links = h3.getnext().cssselect('a')
    dataset = {
        "resources": [anchor_to_resource(l) for l in links],
        "title": "Delayed Transfers of Care - Annual Statistical Reports",
        "origin": ROOT,
        "notes": DEFAULT_NOTES,
        "frequency": "Annually",
        "groups": ['delayed_transfer']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    datasets.append(dataset)

    datasets = filter(lambda x: x is not None, datasets)
    print "Processed {} datasets".format(len(datasets))
    return datasets
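hd is used throughout as a safe "head of list". A minimal sketch of the assumed behaviour:

def hd(sequence):
    # Hypothetical helper: first element of a selector result, or None
    # when nothing matched.
    return sequence[0] if sequence else None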
Example #10
def scrape_page(url):
    dom = get_dom(url)

    description = to_markdown(''.join(
        [tostring(d) for d in dom.cssselect('.summary')]))

    resources = []
    for a in dom.cssselect('.notevalue a'):
        href = a.get('href')
        if 'searchcatalogue' in href or '.exe' in href:
            continue
        if not "datagov.ic.nhs.uk" in href:
            continue
        resources.append(anchor_to_resource(a))

    dataset = {
        "title": dom.cssselect('#headingtext')[0].text_content().strip(),
        "notes": description,
        "resources": resources,
        "tags": ['prescibing'],
        "frequency": "Monthly",
        "origin": url
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()[:99]

    sdate, edate = get_date_range(dom)
    dataset["coverage_start_date"] = sdate or ""
    dataset["coverage_end_date"] = edate or ""

    return dataset
def scrape_page(url):
    """ Scrapes a single page to create a dataset """

    print "Scraping ", url
    page = get_dom(url)
    header = page.cssselect('h1')[1]

    title = header.text_content().strip().replace('/', '-')
    description = get_description(header)

    links = [
        a for a in page.cssselect('.center a') if 'upload' in a.get('href')
    ]
    resources = [anchor_to_resource(l) for l in links]

    start_year, end_year = year_range_from_title(title)

    dataset = {
        "title": title,
        "notes": description,
        "resources": resources,
        "origin": url,
        "coverage_start_date": "{}-04-01".format(start_year),
        "coverage_end_date": "{}-03-31".format(end_year),
        "tags": ["VTE"],
        "groups": ["vte"]
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    print dataset["name"], " has ", len(dataset["resources"]), " resources"
    return dataset
Example #12
def scrape_page(url):
    page = fromstring(requests.get(url).content)

    links = [
        a for a in page.cssselect('a')
        if ('upload' in a.get('href')) or ('files' in a.get('href'))
    ]
    h1 = page.cssselect('h1')[1]
    desc = []
    p = h1.getparent().getnext()
    while True:
        if is_header(p):
            break
        desc.append(tostring(p))
        p = p.getnext()

    m = re.match(".*(\d{4}).*", h1.text_content().strip())
    year = int(m.groups()[0])

    dataset = {
        "title": h1.text_content().strip(),
        "tags": ["winter", "sitrep"],
        "resources": [anchor_to_resource(a) for a in links],
        "notes": to_markdown("".join(desc)),
        "origin": url,
        "coverage_start_date": "{}-11-01".format(year),
        "coverage_end_date": "{}-03-01".format(year + 1),
        "groups": ["winter"]
    }

    return dataset
Example #13
def scrape(workspace):
    print "Scraping Maternity and Breastfeeding with workspace {}".format(
        workspace)
    datasets = []

    page = fromstring(requests.get(ROOT).content)

    links = [
        a for a in page.cssselect('.center a') if 'upload' in a.get('href')
    ]
    print len(links)

    dataset = {
        "title": "Maternity and Breastfeeding Data",
        "tags": ["maternity", "breastfeeding"],
        "resources": [anchor_to_resource(a) for a in links],
        "notes": "This collection reports on the number and proportion "\
                 "of women seen and assessed by a healthcare professional "\
                 "within 12 weeks and 6 days of their maternity, the number "\
                 "and proportion of mothers' who have initiated or not "\
                 "initiated breastfeeding and the number and proportion of "\
                 "infants who have been fully, partially or not at all breastfed "\
                 "at 6-8 weeks",
        "origin": ROOT,
        "groups": ['maternity_breastfeeding']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    print dataset
    return [dataset]
Example #14
def process_para(para, notes):
    title = para.cssselect('strong')[0].text_content()
    if 'CSV Format' in title:  # We'll take the XLS version for now.
        return None

    dataset = {}

    part = title.encode('utf8')[0:7].replace('/', '-')
    s, e = "", ""
    if part == 'England':
        part = 'England Time Series'
    else:
        s, e = date_range_for_year(int(part[0:4]))
    dataset['coverage_start_date'] = s
    dataset['coverage_end_date'] = e

    dataset["title"] = "Mental Health Community Teams Activity - {}".format(part)
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    dataset["origin"] = "http://www.england.nhs.uk/statistics/statistical-work-areas/mental-health-community-teams-activity/"
    dataset["notes"] = notes
    dataset['groups'] = ['mhc']
    links = para.cssselect('a')
    dataset['resources'] = [anchor_to_resource(l) for l in links]

    return dataset
def guidance(page):
    dataset = {
        "title": "Monthly Diagnostic Waiting Times and Activity - Guidance and Documentation",
        "origin": "http://www.england.nhs.uk/statistics/statistical-work-areas/diagnostics-waiting-times-and-activity/monthly-diagnostics-waiting-times-and-activity/",
        "tags": ["waiting times", "statistics"],
        "notes": "",
        "groups": ['mdd']
    }
    dataset['name'] = slugify.slugify(dataset['title']).lower()

    h3s = page.cssselect('h3')
    h3 = filter(lambda x: x.text_content().strip() == "Guidance and Documentation", h3s)[0]
    links = h3.getnext().cssselect('a')
    dataset['resources'] = [anchor_to_resource(l) for l in links]

    p = filter(lambda x: x.text_content().strip() == "Background", h3s)[0]
    desc = []
    while True:
        p = p.getnext()
        if p is None or p.tag != 'p':
            break
        desc.append(tostring(p))
    desc = desc[:-1]
    dataset['notes'] = to_markdown(''.join(desc))

    return dataset
Example #16
def scrape_page(url):
    page = fromstring(requests.get(url).content)

    links = [a for a in page.cssselect('a') if ('upload' in a.get('href')) or ('files' in a.get('href'))]
    h1 = page.cssselect('h1')[1]
    desc = []
    p = h1.getparent().getnext()
    while True:
        if is_header(p):
            break
        desc.append(tostring(p))
        p = p.getnext()

    m = re.match(".*(\d{4}).*", h1.text_content().strip())
    year = int(m.groups()[0])

    dataset = {
        "title": h1.text_content().strip(),
        "tags": ["winter", "sitrep"],
        "resources": [anchor_to_resource(a) for a in links],
        "notes": to_markdown("".join(desc)),
        "origin": url,
        "coverage_start_date": "{}-11-01".format(year),
        "coverage_end_date": "{}-03-01".format(year+1),
        "groups": ["winter"]
    }

    return dataset
def scrape(workspace):
    print "Scraping Child Immunisation with workspace {}".format(workspace)

    html = requests.get(ROOT).content
    page = fromstring(html)

    div = page.cssselect('.center')[0]
    links = div.cssselect('a')[3:]

    h3 = hd([
        h for h in div.cssselect('h3')
        if h.text_content().strip() == "Background"
    ])
    desc = h3.getnext().text_content()

    dataset = {
        "title": "Child Immunisation",
        "notes": to_markdown(fix_bad_unicode(unicode(desc))),
        "coverage_start_date": "",
        "coverage_end_date": "",
        "resources": [],
        "frequency": "Quarterly",
        "origin": ROOT,
        "tags": ["immunisation", "children"],
        "groups": ['child_immunisation']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    earliest_quarter, earliest_year = 4, 9999
    latest_quarter, latest_year = 1, 2000

    for l in links:
        y, q = get_quarter_and_year(l.text_content().strip())
        # Track the earliest and latest (year, quarter) pairs seen.
        if (y, q) < (earliest_year, earliest_quarter):
            earliest_year, earliest_quarter = y, q
        if (y, q) > (latest_year, latest_quarter):
            latest_year, latest_quarter = y, q

        dataset["resources"].append(anchor_to_resource(l))

    if earliest_quarter == 4:
        earliest_year += 1
    if latest_quarter == 4:
        latest_year += 1
    s, e = QUARTERS[earliest_quarter]
    dataset["coverage_start_date"] = "{}-{}-01".format(earliest_year,
                                                       str(s).zfill(2))
    s, e = QUARTERS[latest_quarter]
    _, last_day = calendar.monthrange(latest_year, s - 1)
    dataset["coverage_end_date"] = "{}-{}-{}".format(earliest_year,
                                                     str(s - 1).zfill(2),
                                                     last_day)

    return [dataset]
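QUARTERS and get_quarter_and_year are assumed here. The quarter arithmetic above suggests QUARTERS maps an NHS quarter number to its (start_month, end_month) and that link text reads like "Quarter 1 2013/14"; a sketch under those assumptions:

import re

# Hypothetical definitions: NHS quarters run April-June (Q1) through
# January-March (Q4).
QUARTERS = {1: (4, 6), 2: (7, 9), 3: (10, 12), 4: (1, 3)}

def get_quarter_and_year(text):
    m = re.match(r'.*Quarter\s+(\d)\s+(\d{4}).*', text)
    return int(m.group(2)), int(m.group(1))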
Example #18
def scrape_page(page, url):
    dataset = {
        "title": "NHS {}".format(hd(page.cssselect('h1')).text_content().strip()),
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    dataset["origin"] = url
    dataset["tags"] = ["staff survey"]

    year = re.match('.*(\d{4}).*', dataset['title']).groups()[0]
    dataset["coverage_start_date"] = "{}-01-01".format(year)
    dataset["coverage_end_date"] = "{}-12-31".format(year)

    desc_node = page.cssselect('div.column-content p')
    if desc_node:
        dataset["notes"] = hd(desc_node).text_content()
    else:
        dataset["notes"] = "Results for the Staff Survey {year} can be seen below. "\
                           "We have released detailed spreadsheets showing key finding "\
                           "and question level information for each trust who took part "\
                           "in the {year} survey.".format(year=year)
    dataset['notes'] = to_markdown(dataset['notes'])
    dataset["resources"] = []

    boxes = page.cssselect('.document-box')
    for box in boxes:
        a = box.cssselect('a')[0]
        resource = anchor_to_resource(a)
        resource['description'] = box.cssselect('h4')[0].text_content().strip()
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)

    key = hd([
        a for a in page.cssselect('a')
        if a.text_content().strip() == 'Click here'
    ])
    if key is not None:
        resource = anchor_to_resource(key)
        resource['description'] = "Key Findings"
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)

    return dataset
Example #19
def scrape_indicative():
    global INDICATIVE_DESC
    datasets = []
    page = fromstring(
        requests.get(
            "http://www.england.nhs.uk/statistics/statistical-work-areas/health-visitors/indicative-health-visitor-collection-ihvc/"
        ).content)

    desc = []
    guidance_resources = []

    headerPs = page.cssselect('p strong')
    for h in headerPs:
        txt = h.text_content().strip().encode('utf8')
        if txt.startswith("Background"):
            p = h.getparent().getnext()
            while not _is_header(p):
                desc.append(tostring(p))
                p = p.getnext()
        elif txt.startswith("Guidance"):
            p = h.getparent().getnext()
            while not _is_header(p):
                for a in p.cssselect('a'):
                    guidance_resources.append(anchor_to_resource(a))
                p = p.getnext()
        elif MONTH_YEAR_MATCHER.match(txt):
            description = to_markdown("\n".join(desc))
            if not INDICATIVE_DESC:
                INDICATIVE_DESC = description

            resources = []
            p = h.getparent().getnext()
            while not _is_header(p):
                for a in p.cssselect('a'):
                    resources.append(anchor_to_resource(a))
                p = p.getnext()
            datasets.append(
                make_dataset(
                    txt, description, resources + guidance_resources,
                    "http://www.england.nhs.uk/statistics/statistical-work-areas/health-visitors/indicative-health-visitor-collection-ihvc/"
                ))

    return datasets
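MONTH_YEAR_MATCHER and _is_header are assumed by this walk over <p><strong> headings. A sketch consistent with the usage above:

import re

# Hypothetical definitions: recognise "April 2014" style headings and
# stop the sibling walk at the next heading (or at the end of the list).
MONTH_YEAR_MATCHER = re.compile(
    r'(January|February|March|April|May|June|July|August|'
    r'September|October|November|December)\s+\d{4}')

def _is_header(p):
    return p is None or len(p.cssselect('strong')) > 0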
def get_time_series(h3, url):
    print "Time series..."

    dataset = {
        "title": "Delayed Transfers of Care - Time Series",
        "resources": [anchor_to_resource(l) for l in h3.getnext().cssselect('a')],
        "notes": DEFAULT_NOTES,
        "origin": url,
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    return dataset
Example #21
def get_time_series(h3, url):
    print "Time series..."

    dataset = {
        "title": "A&E Attendances and Emergency Admissions - Time Series",
        "resources": [anchor_to_resource(l) for l in h3.getnext().cssselect('a')],
        "notes": DEFAULT_NOTES,
        "origin": url,
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    return dataset
Example #22
def scrape_archived_page(page, url):
    title = "IPMM - {}".format(page.cssselect('.introContent h2')[0].text_content().strip())
    description = page.cssselect('.introText')[0].text_content().strip()

    dataset = {
        "title": title,
        "notes": description,
        "resources": [anchor_to_resource(a) for a in page.cssselect('.internalLink') if a.text_content().strip().startswith('Download')],
        "origin": url,
    }
    return [dataset]
def scrape_indicative():
    global INDICATIVE_DESC
    datasets = []
    page = fromstring(
        requests.get("http://www.england.nhs.uk/statistics/statistical-work-areas/health-visitors/indicative-health-visitor-collection-ihvc/").content)


    desc = []
    guidance_resources = []

    headerPs = page.cssselect('p strong')
    for h in headerPs:
        txt = h.text_content().strip().encode('utf8')
        if txt.startswith("Background"):
            p = h.getparent().getnext()
            while not _is_header(p):
                desc.append(tostring(p))
                p = p.getnext()
        elif txt.startswith("Guidance"):
            p = h.getparent().getnext()
            while not _is_header(p):
                for a in p.cssselect('a'):
                    guidance_resources.append(anchor_to_resource(a))
                p = p.getnext()
        elif MONTH_YEAR_MATCHER.match(txt):
            description = to_markdown("\n".join(desc))
            if not INDICATIVE_DESC:
                INDICATIVE_DESC = description

            resources = []
            p = h.getparent().getnext()
            while not _is_header(p):
                for a in p.cssselect('a'):
                    resources.append(anchor_to_resource(a))
                p = p.getnext()
            datasets.append(make_dataset(txt,
                                         description,
                                         resources + guidance_resources,
                                         "http://www.england.nhs.uk/statistics/statistical-work-areas/health-visitors/indicative-health-visitor-collection-ihvc/"))

    return datasets
def get_time_series(h3, url):
    print "Time series..."

    dataset = {
        "title": "Delayed Transfers of Care - Time Series",
        "resources": [anchor_to_resource(l) for l in h3.getnext().cssselect('a')],
        "notes": DEFAULT_NOTES,
        "origin": url,
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    return dataset
Example #25
def get_time_series(h3, url):
    print "Time series..."

    dataset = {
        "title": "Critical Care Bed Capacity and Urgent Operations Cancelled - Time Series",
        "resources": [anchor_to_resource(l) for l in h3.getnext().cssselect('a')],
        "notes": DEFAULT_NOTES,
        "origin": url,
        "groups": ['ccc']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    return dataset
Example #26
def process_single_indicator(anchor):
    dataset = {}

    html = requests.get(anchor.get('href')).content
    page = fromstring(html)
    div = page.cssselect('.center')[0]

    dataset['title'] = div.cssselect('h1')[0].text_content().encode('utf8')
    dataset['tags'] = ['ambulance']
    dataset['origin'] = anchor.get('href')
    dataset['name'] = slugify.slugify(dataset['title']).lower()

    s, e = date_range_from_title(dataset['title'])
    dataset['coverage_start_date'] = s
    dataset['coverage_end_date'] = e
    dataset["groups"] =  ['aqi']

    # The notes/description are from h1 to the first <p><strong>....
    desc = []
    start = page.cssselect('p')[0]
    desc.append(tostring(start))

    while True:
        start = start.getnext()
        if len(start.cssselect('strong')) > 0:
            break
        desc.append(tostring(start))

    dataset['notes'] = to_markdown(''.join(desc).encode('utf8'))
    dataset['resources'] = []

    def name_replacement(r):
        r['name'] = r['name'].replace('AmbCO', 'Clinical_Outcomes')
        if 'Indicators' in r['name']:
            r['name'] = r['name'].replace('AmbSYS', 'System')
        else:
            r['name'] = r['name'].replace('AmbSYS', 'System_Indicators')

    links = div.cssselect('p a')
    for link in links:
        href = link.get('href')
        if '/statistics/ambulance-quality-indicators/' in href:
            continue
        if '/statistical-work-areas/ambulance-quality-indicators/' in href:
            continue
        if '#Unifypolicy' in href:
            continue
        r = anchor_to_resource(link, post_create_func=name_replacement)
        dataset['resources'].append(r)

    return dataset
def process_single_indicator(anchor):
    dataset = {}

    html = requests.get(anchor.get('href')).content
    page = fromstring(html)
    div = page.cssselect('.center')[0]

    dataset['title'] = div.cssselect('h1')[0].text_content().encode('utf8')
    dataset['tags'] = ['ambulance']
    dataset['origin'] = anchor.get('href')
    dataset['name'] = slugify.slugify(dataset['title']).lower()

    s, e = date_range_from_title(dataset['title'])
    dataset['coverage_start_date'] = s
    dataset['coverage_end_date'] = e
    dataset["groups"] = ['aqi']

    # The notes/description are from h1 to the first <p><strong>....
    desc = []
    start = page.cssselect('p')[0]
    desc.append(tostring(start))

    while True:
        start = start.getnext()
        if len(start.cssselect('strong')) > 0:
            break
        desc.append(tostring(start))

    dataset['notes'] = to_markdown(''.join(desc).encode('utf8'))
    dataset['resources'] = []

    def name_replacement(r):
        r['name'] = r['name'].replace('AmbCO', 'Clinical_Outcomes')
        if 'Indicators' in r['name']:
            r['name'] = r['name'].replace('AmbSYS', 'System')
        else:
            r['name'] = r['name'].replace('AmbSYS', 'System_Indicators')

    links = div.cssselect('p a')
    for link in links:
        href = link.get('href')
        if '/statistics/ambulance-quality-indicators/' in href:
            continue
        if '/statistical-work-areas/ambulance-quality-indicators/' in href:
            continue
        if '#Unifypolicy' in href:
            continue
        r = anchor_to_resource(link, post_create_func=name_replacement)
        dataset['resources'].append(r)

    return dataset
def scrape(workspace):
    print "Scraping Child Immunisation with workspace {}".format(workspace)

    html = requests.get(ROOT).content
    page = fromstring(html)

    div = page.cssselect('.center')[0]
    links = div.cssselect('a')[3:]

    h3 = hd([h for h in div.cssselect('h3') if h.text_content().strip() == "Background"])
    desc = h3.getnext().text_content()

    dataset = {
        "title": "Child Immunisation",
        "notes": to_markdown(fix_bad_unicode(unicode(desc))),
        "coverage_start_date": "",
        "coverage_end_date": "",
        "resources": [],
        "frequency": "Quarterly",
        "origin": ROOT,
        "tags": ["immunisation", "children"],
        "groups": ['child_immunisation']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    earliest_quarter, earliest_year = 4, 9999
    latest_quarter, latest_year = 1, 2000

    for l in links:
        y, q = get_quarter_and_year(l.text_content().strip())
        # Track the earliest and latest (year, quarter) pairs seen.
        if (y, q) < (earliest_year, earliest_quarter):
            earliest_year, earliest_quarter = y, q
        if (y, q) > (latest_year, latest_quarter):
            latest_year, latest_quarter = y, q

        dataset["resources"].append(anchor_to_resource(l))

    if earliest_quarter == 4:
        earliest_year += 1
    if latest_quarter == 4:
        latest_year += 1
    s, e = QUARTERS[earliest_quarter]
    dataset["coverage_start_date"] = "{}-{}-01".format(earliest_year, str(s).zfill(2))
    s, e = QUARTERS[latest_quarter]
    _, last_day = calendar.monthrange(latest_year, s-1)
    dataset["coverage_end_date"] = "{}-{}-{}".format(earliest_year, str(s-1).zfill(2), last_day)

    return [dataset]
def month(url, desc):
    datasets = []
    print "-->", url
    html = requests.get(url).content
    page = fromstring(html)

    # http://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2014/09/Monthly-Diagnostics-Web-File-Timeseries-December-2014.xls
    links = page.cssselect('.center p a')
    trimmed_links = filter(lambda x: x.text_content().strip().startswith('Historical'), links)
    if trimmed_links:
        t = page.cssselect('header h1')[1].text_content()
        dataset = {
            "title": "Monthly Diagnostics Data - Timeseries - {}".format(t[-7:]),
            "origin": url,
            "tags": ["statistics", "diagnostics"],
            "notes": desc,
            "resources": [anchor_to_resource(a) for a in trimmed_links],
            "groups": ['mdd']
        }
        dataset["name"] = slugify.slugify(dataset["title"]).lower()
        datasets.append(dataset)

    links = filter(lambda x: x.text_content().strip().startswith('Monthly'), page.cssselect('.center p a'))
    for first, second in _chunky(links):
        when = re.match('.*\s(.*?\s\d{4}?).*\(.*', first.text_content().strip())
        dataset = {
            "title": "Monthly Diagnostics Data - {}".format(when.groups()[0]),
            "origin": url,
            "tags": ["statistics", "diagnostics"],
            "notes": desc,
            "groups": ['mdd']
        }
        dataset["name"] = slugify.slugify(dataset["title"]).lower()
        dataset["coverage_start_date"],dataset["coverage_end_date"] = date_range_from_string(when.groups()[0])
        dataset['resources'] = [anchor_to_resource(r) for r in [first,second]]
        datasets.append(dataset)

    return datasets
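_chunky is assumed to pair up consecutive links, since each month appears as one link followed by its companion file. A minimal sketch:

def _chunky(iterable):
    # Hypothetical pairing helper: yield items two at a time, e.g.
    # [a, b, c, d] -> [(a, b), (c, d)].
    iterator = iter(iterable)
    return zip(iterator, iterator)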
Example #30
def scrape_page(page, url):
    dataset = {
        "title": "NHS {}".format(hd(page.cssselect('h1')).text_content().strip()),
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    dataset["origin"] = url
    dataset["tags"] = ["staff survey"]

    year = re.match('.*(\d{4}).*', dataset['title']).groups()[0]
    dataset["coverage_start_date"] = "{}-01-01".format(year)
    dataset["coverage_end_date"] = "{}-12-31".format(year)

    desc_node = page.cssselect('div.column-content p')
    if desc_node:
        dataset["notes"] = hd(desc_node).text_content()
    else:
        dataset["notes"] = "Results for the Staff Survey {year} can be seen below. "\
                           "We have released detailed spreadsheets showing key finding "\
                           "and question level information for each trust who took part "\
                           "in the {year} survey.".format(year=year)
    dataset['notes'] = to_markdown(dataset['notes'])
    dataset["resources"] = []

    boxes = page.cssselect('.document-box')
    for box in boxes:
        a = box.cssselect('a')[0]
        resource = anchor_to_resource(a)
        resource['description'] = box.cssselect('h4')[0].text_content().strip()
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)

    key = hd([a for a in page.cssselect('a') if a.text_content().strip() == 'Click here'])
    if key is not None:
        resource = anchor_to_resource(key)
        resource['description'] = "Key Findings"
        resource['url'] = urljoin(ROOT, resource['url'])
        dataset["resources"].append(resource)

    return dataset
def get_time_series(h3, url):
    print "Time series..."

    dataset = {
        "title": "Critical Care Bed Capacity and Urgent Operations Cancelled - Time Series",
        "resources": [anchor_to_resource(l) for l in h3.getnext().cssselect('a')],
        "notes": DEFAULT_NOTES,
        "origin": url,
        "groups": ['ccc']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    return dataset
Example #32
def scrape_block(block, title):
    global DESCRIPTION

    dataset = {
        "title": title,
        "notes": DESCRIPTION,
        "tags": ["sitrep", "winter"],
        "origin": ROOT,
        "resources": [anchor_to_resource(a) for a in block.cssselect('.itemLinks li a')],
        "groups": ['winter']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    for r in dataset["resources"]:
        r['description'] = r['description'].replace('Download ', '')
    return dataset
def process_dataset(title, links, notes):
    dataset = {}

    year_string, start, end = years_to_date_range(title)

    dataset['title'] = ROOT_TITLE.format(year_string)
    dataset['name'] = slugify.slugify(dataset['title']).lower()
    dataset['coverage_start_date'] = start
    dataset['coverage_end_date'] = end
    dataset['notes'] = notes
    dataset['origin'] = ROOT
    dataset['resources'] = [anchor_to_resource(r) for r in links]
    dataset['tags'] = ['elective']
    dataset["groups"] = ['ceo']
    return dataset
def historical_indicative():
    page = fromstring(requests.get(HISTORICAL).content)
    datasets = []

    current_resources = None

    paras = page.cssselect('.center p')
    for p in paras:
        text = p.text_content()
        if MONTH_YEAR_MATCHER.match(text) and 'website' not in text:
            current_resources = [anchor_to_resource(a) for a in p.getnext().cssselect('a')]
            if len(current_resources) > 0:
                datasets.append(make_dataset(text, INDICATIVE_DESC, current_resources, HISTORICAL))

    return datasets
def scrape(workspace):
    print "Scraping MSA with workspace {}".format(workspace)

    datasets = []

    page = requests.get(
        "http://www.england.nhs.uk/statistics/statistical-work-areas/mental-health-community-teams-activity/"
    )
    html = fromstring(page.content)

    center = html.cssselect('.column.center')[0]

    h3s = list(center.cssselect('H3'))
    p = filter(lambda x: x.text_content().startswith('Background'), h3s)[0]

    desc = []
    while True:
        p = p.getnext()
        if p is None or p.tag != 'p':
            break
        desc.append(p.text_content())
    notes = to_markdown(''.join(desc))

    guidance = filter(lambda x: x.text_content().startswith('Guidance'),
                      h3s)[0].getnext().cssselect('a')[0]
    r = anchor_to_resource(guidance)

    data = filter(lambda x: x.text_content().startswith('Data'), h3s)[0]
    paras = []
    while True:
        data = data.getnext()
        if data is None or data.tag != 'p':
            break
        paras.append(data)

    datasets.extend([process_para(p, notes) for p in paras])

    datasets = filter(lambda x: x is not None, datasets)
    # Insert the guidance into each dataset
    for dataset in datasets:
        dataset['resources'].insert(0, r)

    datasets = sorted(datasets, key=lambda x: x['title'])
    print datasets
    return datasets
def scrape_archived_page(page, url):
    title = "IPMM - {}".format(
        page.cssselect('.introContent h2')[0].text_content().strip())
    description = page.cssselect('.introText')[0].text_content().strip()

    dataset = {
        "title": title,
        "notes": description,
        "resources": [
            anchor_to_resource(a) for a in page.cssselect('.internalLink')
            if a.text_content().strip().startswith('Download')
        ],
        "origin": url,
    }
    return [dataset]
def add_singles(page, url):
    links = page.cssselect('.center p a')

    dataset = {
        "title": page.cssselect('h1')[1].text_content().strip(),
        "resources": [],
        "notes": DEFAULT_NOTES,
        "frequency": "Monthly",
        "origin": url,
        "groups": ['delayed_transfer']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    for link in links:
        if 'Monthly' not in link.text_content().strip():
            continue
        dataset["resources"].append(anchor_to_resource(link))

    return dataset
Example #38
def historical_indicative():
    page = fromstring(requests.get(HISTORICAL).content)
    datasets = []

    current_resources = None

    paras = page.cssselect('.center p')
    for p in paras:
        text = p.text_content()
        if MONTH_YEAR_MATCHER.match(text) and 'website' not in text:
            current_resources = [
                anchor_to_resource(a) for a in p.getnext().cssselect('a')
            ]
            if len(current_resources) > 0:
                datasets.append(
                    make_dataset(text, INDICATIVE_DESC, current_resources,
                                 HISTORICAL))

    return datasets
def scrape_page(url):
    page = fromstring(requests.get(url).content)
    if 'webarchive' in url:
        return scrape_archived_page(page, url)

    datasets = []

    h = page.cssselect('h1')[1]
    title = "IPMM - {}".format(h.text_content().strip())

    desc = []
    p = h.getparent().getnext()
    while True:
        if p is None or p.tag != 'p' or len(p.cssselect('a')) > 0:
            break
        desc.append(tostring(p))
        p = p.getnext()
    description = to_markdown("".join(desc))

    hs = page.cssselect('.center h4')
    if len(hs) < 2:
        hs = page.cssselect('.center h3')

    for h in hs:
        subtitle = "{} - {}".format(title, h.text_content().strip())
        links = h.getnext().cssselect('a')

        m = YEAR_MATCHER.match(h.text_content().strip())
        year_start = int(m.groups()[0])

        dataset = {
            "title": subtitle,
            "notes": description,
            "origin": url,
            "resources": [anchor_to_resource(a) for a in links],
            "coverage_start_date": "{}-04-01".format(year_start),
            "coverage_end_date": "{}-03-31".format(year_start + 1),
            "frequency": "Annually",
            "groups": ['ipmm']
        }
        datasets.append(dataset)

    return datasets
Example #40
def add_singles(page, url):
    links = page.cssselect('.center p a')

    dataset = {
        "title": page.cssselect('h1')[1].text_content().strip(),
        "resources": [],
        "notes": DEFAULT_NOTES,
        "frequency": "Monthly",
        "origin": url,
        "groups": ['ccc']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    for link in links:
        if 'Monthly' not in link.text_content().strip():
            continue
        dataset["resources"].append(anchor_to_resource(link))

    return dataset
def historical_beds(page, url, title):
    dataset = {}

    desc = page.cssselect('.introText')[0].text_content().strip().encode('utf8')

    sublinks = sorted(page.cssselect('.subLinks a'), key=lambda x: x.text_content().strip())
    print len(sublinks)
    first = int(re.match(".*(\d{4}).*", sublinks[0].text_content()).groups()[0])
    last = int(re.match(".*(\d{4}).*", sublinks[-1].text_content()).groups()[0]) + 1

    dataset["title"] = title
    dataset["origin"] = url
    dataset["coverage_start_date"] = "{}-04-01".format(first)
    dataset["coverage_end_date"] = "{}-03-31".format(last)
    dataset["name"] = slugify.slugify(title).lower()
    dataset["resources"] = [anchor_to_resource(a) for a in sublinks]
    dataset["notes"] = desc
    dataset["groups"] = ['bed_availability']
    return dataset
Example #42
def scrape_page(url):
    page = fromstring(requests.get(url).content)
    if 'webarchive' in url:
        return scrape_archived_page(page, url)

    datasets = []

    h = page.cssselect('h1')[1]
    title = "IPMM - {}".format(h.text_content().strip())

    desc = []
    p = h.getparent().getnext()
    while True:
        if p is None or p.tag != 'p' or len(p.cssselect('a')) > 0:
            break
        desc.append(tostring(p))
        p = p.getnext()
    description = to_markdown("".join(desc))

    hs = page.cssselect('.center h4')
    if len(hs) < 2:
        hs = page.cssselect('.center h3')

    for h in hs:
        subtitle = "{} - {}".format(title, h.text_content().strip())
        links = h.getnext().cssselect('a')

        m = YEAR_MATCHER.match(h.text_content().strip())
        year_start = int(m.groups()[0])

        dataset = {
            "title": subtitle,
            "notes": description,
            "origin": url,
            "resources": [anchor_to_resource(a) for a in links],
            "coverage_start_date": "{}-04-01".format(year_start),
            "coverage_end_date": "{}-03-31".format(year_start+1),
            "frequency": "Annually",
            "groups": ['ipmm']
        }
        datasets.append(dataset)

    return datasets
Example #43
def scrape(workspace):
    print "Scraping MSA with workspace {}".format(workspace)

    datasets = []

    page = requests.get("http://www.england.nhs.uk/statistics/statistical-work-areas/mental-health-community-teams-activity/")
    html = fromstring(page.content)

    center = html.cssselect('.column.center')[0]

    h3s = list(center.cssselect('H3'))
    p = filter(lambda x: x.text_content().startswith('Background'), h3s)[0]

    desc = []
    while True:
        p = p.getnext()
        if p is None or p.tag != 'p':
            break
        desc.append(p.text_content())
    notes = to_markdown(''.join(desc))

    guidance = filter(lambda x: x.text_content().startswith('Guidance'), h3s)[0].getnext().cssselect('a')[0]
    r = anchor_to_resource(guidance)

    data = filter(lambda x: x.text_content().startswith('Data'), h3s)[0]
    paras = []
    while True:
        data = data.getnext()
        if data is None or data.tag != 'p':
            break
        paras.append(data)

    datasets.extend([process_para(p, notes) for p in paras])

    datasets = filter(lambda x: x is not None, datasets)
    # Insert the guidance into each dataset
    for dataset in datasets:
        dataset['resources'].insert(0, r)

    datasets = sorted(datasets, key=lambda x: x['title'])
    print datasets
    return datasets
def add_year_block(header, url):
    m = re.match("(.*)(\d{4})", header.text_content().strip())
    h3 = header

    if h3.getnext() is None:
        # Sometimes the header is hidden in a div. Sigh.
        h3 = h3.getparent()

    links = []
    while h3 is not None:
        h3 = h3.getnext()
        if h3 is None or h3.tag != "p":
            break
        links.extend(h3.cssselect('a'))

    year = m.groups()[1]
    import string
    month = filter(lambda x: x in string.printable, m.groups()[0].strip())

    dataset = {
        "title": u"Critical Care Bed Capacity and Urgent Operations Cancelled - {} {}".format(month, year),
        "resources": [anchor_to_resource(l) for l in links],
        "notes": DEFAULT_NOTES,
        "origin": url,
        "frequency": "Monthly",
        "groups": ['ccc']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()

    mnth = list(calendar.month_name).index(month)
    _, e = calendar.monthrange(int(m.groups()[1]), mnth)
    # Zero-pad the month so the coverage dates are valid ISO dates.
    dataset['coverage_start_date'] = "{}-{:02d}-01".format(m.groups()[1].strip(), mnth)
    dataset['coverage_end_date'] = "{}-{:02d}-{}".format(m.groups()[1].strip(), mnth, e)

    return dataset
def scrape_metrics():
    datasets = []
    page = fromstring(requests.get(METRICS_URL).content)

    matcher = re.compile(".*Q(\d{1})\s(\d{4})-(\d{2}).*")

    h1 = page.cssselect('h1')[1].getparent()
    title = h1.text_content().strip()

    desc = []
    while True:
        h1 = h1.getnext()
        if len(h1.cssselect('strong')) > 0:
            break
        desc.append(tostring(h1))
    description = to_markdown("".join(desc))

    links = page.cssselect('p a')
    unsorted_links = collections.defaultdict(list)
    for l in links:
        m = matcher.match(l.text_content())
        if not m:
            continue
        k = "{}-{}".format(m.groups()[1], m.groups()[2])
        unsorted_links[k].append([l, m.groups()])

    for k, v in unsorted_links.iteritems():
        dataset = {
            "title": "{} {}".format(title, k),
            "notes": description,
            "origin": METRICS_URL,
            "resources": [],
            "frequency": "Quarterly"
        }
        dataset["name"] = slugify.slugify(dataset["title"]).lower()
        for link, time_tuple in v:
            dataset["resources"].append(anchor_to_resource(link))
        datasets.append(dataset)

    return datasets
Example #46
def process_latest(datasets, latest):
    """
    We process the latest data as a special case because it is
    all munged together in a separate block.  We need to find the
    links, parse them, try and group them by name, and then decide
    how we're going to label the dataset.
    """
    for anchor in latest:
        resource = anchor_to_resource(anchor)
        y = int(string_to_date(resource['description'])[:4])
        finder = "{}-{}".format(y-1, str(y)[2:4])
        finder = "{} - {}".format(TITLE_ROOT, finder)

        # We can find the first dataset in the list (datasets) whose
        # title starts with finder as the most recent years go at
        # the top of the list on the page.
        for dataset in datasets:
            if dataset['title'].startswith(finder):
                print "We think ", resource['description'], "goes in", dataset['title']
                dataset['resources'].insert(0, resource)
                break
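string_to_date is only relied on for its first four characters being the year, so any helper that turns a description such as "March 2014" into an ISO date string would do. A hypothetical sketch:

from datetime import datetime

def string_to_date(text):
    # Hypothetical helper: parse "March 2014" into "2014-03-01".
    return datetime.strptime(text, '%B %Y').strftime('%Y-%m-%d')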
def scrape(workspace):
    print "Scraping Direct Access to Audiology with workspace {}".format(
        workspace)
    datasets = []

    html = requests.get(ROOT).content
    page = fromstring(html)

    desc = page.cssselect('h1')[1].getparent().getnext().text_content().strip()

    def is_header_div(d):
        return d is None or d.tag == 'h3' or\
            (d.tag == 'div' and len(d.cssselect('h3')) == 1)

    h3s = page.cssselect('h3')
    for h3 in h3s:
        title = h3.text_content().strip()
        container = []
        while h3 is not None:
            h3 = h3.getnext()
            if is_header_div(h3):
                break
            container.extend(h3.cssselect('a'))

        dataset = {
            "title": "Direct Access Audiology Data - {}".format(title),
            "resources": [anchor_to_resource(l) for l in container],
            "origin": ROOT,
            "notes": desc,
            "tags": ["audiology"],
            "groups": ['direct_access_audiology']
        }
        s, e = date_range_from_string(title)
        dataset["coverage_start_date"] = s
        dataset["coverage_end_date"] = e
        dataset["name"] = slugify.slugify(dataset["title"]).lower()
        datasets.append(dataset)

    datasets = filter(lambda x: x is not None, datasets)
    return datasets
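date_range_from_string is assumed to turn a "Month YYYY" title into ISO start and end dates covering that month. A sketch under that assumption:

import calendar
import re
from datetime import date

def date_range_from_string(text):
    # Hypothetical helper: ("", "") when nothing parseable is found.
    m = re.search(r'(January|February|March|April|May|June|July|August|'
                  r'September|October|November|December)\s+(\d{4})', text)
    if not m:
        return "", ""
    month = list(calendar.month_name).index(m.group(1))
    year = int(m.group(2))
    _, last = calendar.monthrange(year, month)
    return (date(year, month, 1).isoformat(),
            date(year, month, last).isoformat())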
Example #48
def scrape_metrics():
    datasets = []
    page = fromstring(requests.get(METRICS_URL).content)

    matcher = re.compile(".*Q(\d{1})\s(\d{4})-(\d{2}).*")

    h1 = page.cssselect('h1')[1].getparent()
    title = h1.text_content().strip()

    desc = []
    while True:
        h1 = h1.getnext()
        if len(h1.cssselect('strong')) > 0:
            break
        desc.append(tostring(h1))
    description = to_markdown("".join(desc))

    links = page.cssselect('p a')
    unsorted_links = collections.defaultdict(list)
    for l in links:
        m = matcher.match(l.text_content())
        if not m:
            continue
        k = "{}-{}".format(m.groups()[1], m.groups()[2])
        unsorted_links[k].append([l, m.groups()])

    for k, v in unsorted_links.iteritems():
        dataset = {
            "title": "{} {}".format(title, k),
            "notes": description,
            "origin": METRICS_URL,
            "resources": [],
            "frequency": "Quarterly"
        }
        dataset["name"] = slugify.slugify(dataset["title"]).lower()
        for link, time_tuple in v:
            dataset["resources"].append(anchor_to_resource(link))
        datasets.append(dataset)

    return datasets
def scrape(workspace):
    print "Scraping Direct Access to Audiology with workspace {}".format(workspace)
    datasets = []

    html = requests.get(ROOT).content
    page = fromstring(html)

    desc = page.cssselect('h1')[1].getparent().getnext().text_content().strip()

    def is_header_div(d):
        return d is None or d.tag == 'h3' or\
            (d.tag == 'div' and len(d.cssselect('h3')) == 1)

    h3s = page.cssselect('h3')
    for h3 in h3s:
        title = h3.text_content().strip()
        container = []
        while h3 is not None:
            h3 = h3.getnext()
            if is_header_div(h3):
                break
            container.extend(h3.cssselect('a'))

        dataset = {
            "title": "Direct Access Audiology Data - {}".format(title),
            "resources": [anchor_to_resource(l) for l in container],
            "origin": ROOT,
            "notes": desc,
            "tags": ["audiology"],
            "groups": ['direct_access_audiology']
        }
        s, e = date_range_from_string(title)
        dataset["coverage_start_date"] = s
        dataset["coverage_end_date"] = e
        dataset["name"] = slugify.slugify(dataset["title"]).lower()
        datasets.append(dataset)

    datasets = filter(lambda x: x is not None, datasets)
    return datasets
def process_latest(datasets, latest):
    """
    We process the latest data as a special case because it is
    all munged together in a separate block.  We need to find the
    links, parse them, try and group them by name, and then decide
    how we're going to label the dataset.
    """
    for anchor in latest:
        resource = anchor_to_resource(anchor)
        y = int(string_to_date(resource['description'])[:4])
        finder = "{}-{}".format(y - 1, str(y)[2:4])
        finder = "{} - {}".format(TITLE_ROOT, finder)

        # We can find the first dataset in the list (datasets) whose
        # title starts with finder as the most recent years go at
        # the top of the list on the page.
        for dataset in datasets:
            if dataset['title'].startswith(finder):
                print "We think ", resource['description'], "goes in", dataset[
                    'title']
                dataset['resources'].insert(0, resource)
                break
Example #51
def process_block(p, title, description, current_year):
    if not current_year:
        return None

    dataset = {
        "title": "{} - {}".format(TITLE_ROOT, title),
        "notes": description,
        "tags": ["Statistics", current_year],
        "resources": [],
        "origin": "http://www.england.nhs.uk/statistics/statistical-work-areas/mixed-sex-accommodation/msa-data/",
        "groups": ["msa"]
    }

    for resource in p.cssselect('a'):
        r = anchor_to_resource(resource)
        if r['format'] == 'XLSM':
            r['format'] = 'XLS'
        dataset["resources"].append(r)

    dataset["name"] = slugify.slugify(dataset['title']).lower()

    return dataset
def process(page, url):
    desc = "Annual Imaging and Radiodiagnostics data relate to the number of imaging "\
           "and radiological examinations or tests carried out in the NHS in England "\
           "during each year. Data for this collection is available back to 1995-96."
    title = "Annual Imaging and Radiodiagnostics Data"
    dataset = {
        "title": title,
        "name": slugify.slugify(title).lower(),
        "origin": url,
        "notes": desc,
        "resources": [],
        "groups": ['aird'],
    }

    links = page.cssselect('.center p a')
    for link in links:
        href = link.get('href')
        ext = href[-3:]
        if ext in ['xls', 'doc', 'pdf']:
            dataset['resources'].append(anchor_to_resource(link))

    return dataset
def scrape(workspace):
    print "Scraping Delayed Transfer {}".format(workspace)
    global DEFAULT_NOTES

    html = requests.get(ROOT)
    page = fromstring(html.content)
    default_notes(page)

    h3 = hd([
        h for h in page.cssselect('h3') if h.text_content().strip() == 'Data'
    ])
    links = h3.getnext().cssselect('a')

    datasets = []
    datasets.extend(scrape_page(links[-1].get("href")))
    for l in links:
        datasets.extend(scrape_page(l.get("href")))

    # Get the annual statistical reports
    h3 = hd([
        h for h in page.cssselect('h3')
        if h.text_content().strip() == 'Annual Statistical Report'
    ])
    links = h3.getnext().cssselect('a')
    dataset = {
        "resources": [anchor_to_resource(l) for l in links],
        "title": "Delayed Transfers of Care - Annual Statistical Reports",
        "origin": ROOT,
        "notes": DEFAULT_NOTES,
        "frequency": "Annually",
        "groups": ['delayed_transfer']
    }
    dataset["name"] = slugify.slugify(dataset["title"]).lower()
    datasets.append(dataset)

    datasets = filter(lambda x: x is not None, datasets)
    print "Processed {} datasets".format(len(datasets))
    return datasets
def current_beds(page, url, title):
    datasets = []

    div = page.cssselect('.center')[0]
    desc = div.cssselect('p')[0].text_content().strip()

    all_resources = [
        anchor_to_resource(a) for a in div.cssselect('p a')
        if 'XLS' in a.text_content()
    ]
    grouped = collections.defaultdict(list)

    for resource in all_resources:
        if "Time" in resource['description']:
            grouped['TimeSeries'].append(resource)
            continue

        yr = YR_MATCH.match(resource['description']).groups()[0]
        grouped[yr].append(resource)

    for y in sorted(grouped.keys()):
        dataset = {}
        if y == 'TimeSeries':
            dataset["title"] = "{} - Timeseries".format(title)
        else:
            dataset["title"] = "{} {}-{}".format(title, y, int(y) + 1)
            dataset["coverage_start_date"] = "{}-04-01".format(y)
            dataset["coverage_end_date"] = "{}-03-31".format(int(y) + 1)

        dataset["name"] = slugify.slugify(dataset["title"]).lower()
        dataset["origin"] = url
        dataset["tags"] = ["bed availability"]
        dataset["notes"] = desc
        dataset["resources"] = grouped[y]
        dataset["groups"] = ['bed_availability']
        datasets.append(dataset)

    return datasets