def extract_table_data(pct_name,s,facility_type):
    """
    Extracts data from a list of PCT facilities
    """

    services = []
    d = {}
    for t in s.getchildren():
        if t.tag=="dt":
            if d != {}:
                services.append(d)
            d = {"PCT":pct_name, "type":"service"}
            u = t.find("a")
            if u != None:
                t = u
                d["info HTML"] = "http://www.nhs.uk" + t.attrib["href"]
            name = (t.text or "").strip()
            d["name"] = name
            print name
        elif t.text[:4]=="tel:":
            d["telephone"]=t.text[5:]
        else:
            address = t.text
            d["address"] = address
            postcode = geo.extract_gb_postcode(address)
            d["postcode"] = postcode
            d["latlng"] = geo.gb_postcode_to_latlng(postcode)
            
    for d in services:
        if "info HTML" in d:
            scrape_extra(d,facility_type)
        datastore.save(unique_keys=["PCT","type","name","address"], data=d)
# Ejemplo n.º 2 ("Example no. 2" -- paste/scrape artifact separating snippets)
# 0  (artifact: vote/score counter captured with the paste; not code)
 def latlng(self):
     """Return the (lat, lng) pair for the proposal's address, or None
     when the address field is empty/missing."""
     from scraperwiki import geo
     address = self[u'Address of Proposal']
     if not address:
         return None
     postcode = geo.extract_gb_postcode(address)
     return geo.gb_postcode_to_latlng(postcode)
def extract_table_data(pct_name, s, facility_type):
    """
    Extracts data from a list of PCT facilities
    """

    services = []
    d = {}
    for t in s.getchildren():
        if t.tag == "dt":
            if d != {}:
                services.append(d)
            d = {"PCT": pct_name, "type": "service"}
            u = t.find("a")
            if u != None:
                t = u
                d["info HTML"] = "http://www.nhs.uk" + t.attrib["href"]
            name = (t.text or "").strip()
            d["name"] = name
            print name
        elif t.text[:4] == "tel:":
            d["telephone"] = t.text[5:]
        else:
            address = t.text
            d["address"] = address
            postcode = geo.extract_gb_postcode(address)
            d["postcode"] = postcode
            d["latlng"] = geo.gb_postcode_to_latlng(postcode)

    for d in services:
        if "info HTML" in d:
            scrape_extra(d, facility_type)
        datastore.save(unique_keys=["PCT", "type", "name", "address"], data=d)
def scrape_pct(link, pct_name):
    """
    Scrapes the data associated with the PCT, and calls functions to scrape
    data associated with the services.

    link -- site-relative URL of the PCT's profile page on www.nhs.uk
    pct_name -- display name of the Primary Care Trust; used for console
        progress output and as part of the datastore unique key
    """

    # Console progress banner (Python 2 print-statement syntax).
    print
    print
    print pct_name
    print "-" * len(pct_name)

    url = "http://www.nhs.uk" + link
    # html5lib parser producing an lxml tree; scrape() fetches the raw page.
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))
    root = page.getroot()

    d = {}

    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    # First paragraph of the page body carries the postal address.
    address = root.find("body/div/form/div/div/p").text
    d["address"] = address
    postcode = geo.extract_gb_postcode(address)
    d["postcode"] = postcode
    d["latlng"] = geo.gb_postcode_to_latlng(postcode)
    d["info HTML"] = url

    # quality ratings: each feedback widget contributes one entry keyed by
    # its heading, with the rating taken from the star image's alt text.
    for t in root.findall(
            "body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"
    ):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v

    # head honcho: the caption under the profile picture names the boss.
    # NOTE(review): .text of a parsed element normally contains no literal
    # "<br />" markup, so the replace may be a no-op -- confirm on a live page.
    for t in root.findall(
            "body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"
    ):
        d["Boss"] = t.text.replace("<br />", ", ")

    # boring text: the paragraph flagged class="intro" is stored on its own;
    # all other paragraphs are concatenated into one boilerplate blob.
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class", False) == "intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = d.get("boilerplate", "") + "\n" + t.text

    datastore.save(unique_keys=["PCT", "type", "name", "address"],
                   data=d,
                   latlng=d.get("latlng"))

    # Hand off to the per-service scrapers for this PCT.
    scrape_facilities(pct_name, root)
    scrape_others(pct_name, url)
def scrape_pct(link,pct_name):
    """
    Scrapes the data associated with the PCT, and calls functions to scrape
    data associated with the services.
    """
    
    print
    print
    print pct_name
    print "-"*len(pct_name)

    url = "http://www.nhs.uk" + link
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))
    root = page.getroot()

    d = {}

    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    address = root.find("body/div/form/div/div/p").text
    d["address"] = address
    postcode = geo.extract_gb_postcode(address)
    d["postcode"] = postcode
    d["latlng"] = geo.gb_postcode_to_latlng(postcode)
    d["info HTML"] = url

    # quality
    for t in root.findall("body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v

    # head honcho
    for t in root.findall("body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"):
        d["Boss"] = t.text.replace("<br />",", ")

    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class",False)=="intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = d.get("boilerplate","")+"\n"+t.text

    datastore.save(unique_keys=["PCT","type","name","address"], data=d, latlng=d.get("latlng"))

    scrape_facilities(pct_name,root)
    scrape_others(pct_name,url)
def scrape_pct(link,pct_name):
    """
    Scrapes the data associated with the PCT, and calls functions to scrape
    data associated with the services.
    """
    
    url = "http://www.nhs.uk" + link
    root = lxml.html.parse(url).getroot()

    d = {}

    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    print lxml.html.tostring(root)
    address = root.cssselect("div.panel-content div.pad p")[0].text
    d["address"] = address
    d["postcode"]= geo.extract_gb_postcode(address)
    try:
        d["lat"], d["lng"] = geo.gb_postcode_to_latlng(d["postcode"])
    except:
        print "Postcode not found", d["postcode"]
    d["info HTML"] = url

    colour = "green"
    # quality
    for t in root.findall("body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v
        if k == "Fair":
            colour = "yellow"
    d["colour"] = colour

    # head honcho
    for t in root.findall("body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"):
        d["Boss"] = t.text.replace("<br />",", ")

    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class",False)=="intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = (d.get("boilerplate","")+"\n"+t.text).strip()

    sqlite.save(unique_keys=["PCT","type","name"], data=d)
    
    scrape_facilities(pct_name,root)
    scrape_others(pct_name,url)
def entries_from_doc(doc):
    """Yield one entry dict per populated row of the results table."""
    for row in doc.xpath('//div[@class="tablecontainer"]//tr'):
        cells = [cell.text_content().strip() for cell in row.xpath('./td')]
        if not cells:
            # Header/spacer rows have no <td> children.
            continue
        entry = dict(zip(COLUMN_NAMES, cells))
        try:
            entry['valid_date'] = datetime.strptime(entry['valid_date'], DATE_FORMAT)
        except ValueError:
            # Leave the raw string in place when the date fails to parse.
            pass
        entry['details_url'] = urljoin(URL, row.xpath('.//a/@href')[0])
        postcode = extract_gb_postcode(entry['site_location'])
        if postcode:
            entry['postcode'] = postcode
            # Geocoding deliberately disabled:
            #latlng = gb_postcode_to_latlng(postcode)
            #if latlng:
            #    entry['lat'], entry['lng'] = latlng
        yield entry
def tests():
    pc = extract_gb_postcode('10 Romford Road Preston Lancashire')
    print pc, gb_postcode_to_latlng(pc)
 def latlng(self):
     """Return the (lat, lng) pair for the proposal's address, or None
     when the address field is empty/missing."""
     from scraperwiki import geo
     # ROBUSTNESS FIX: guard against an empty address so extract_gb_postcode
     # is never called with None/"" (matches the other latlng implementation
     # in this file, which already has this check).
     address = self[u'Address of Proposal']
     if not address:
         return None
     return geo.gb_postcode_to_latlng(
         geo.extract_gb_postcode(address))