def deep_scrape(urn):
    print "URN: %s" % urn
    keyvaluepairs = {}

    def merge_in(d):
        "update keyvaluepairs with d; complain if anything is overwritten"
        for (k, v) in d.iteritems():
            if k in keyvaluepairs:
                assert keyvaluepairs[k] == v
            else:
                keyvaluepairs[k] = v

    merge_in(summary_scrape(urn))
    merge_in(page_scrape("general", urn))
    merge_in(page_scrape("school-characterisics", urn))
    merge_in(page_scrape("links", urn))
    merge_in(page_scrape("sen", urn))
    merge_in(page_scrape("pru", urn))
    merge_in(page_scrape("quality-indicators", urn))
    merge_in(page_scrape("communications", urn))
    merge_in(page_scrape("census-data", urn))
    merge_in(page_scrape("regional-indicators", urn))

    datastore.save(unique_keys=["URN"], data=keyvaluepairs)
    print
def parse_page(page, url):
    for table in page.findAll('table', {'id':'caselist'}):
        for row in table.findAll('tr')[1:]:
            if row['class'].find('last') < 0:
                cells = row.findAll('td')
                
                handed_down = cells[0].string
                neutral_citation = cells[1].string
                case_id = cells[2].string            
                case_name = cells[3].contents[1].string
                court = ''                                    
     
                # return absolute urls, they are WAY more useful.
                judgment_pdf_link = urlparse.urljoin(url, cells[4].findAll('a', title=Jr)[0]['href'])
                press_summary_link = urlparse.urljoin(url, cells[4].findAll('a', title=PSr)[0]['href'])
    
            #save to datastore
            data = {
                    'case_name' : case_name,
                    'handed_down' : handed_down,
                    'case_id' : case_id,
                    'neutral_citation' : neutral_citation,
                    'judgment_pdf_link' : judgment_pdf_link,
                    'press_summary_link' : press_summary_link
                    }
            datastore.save(unique_keys=['case_id'], data=data)
def deep_scrape(urn):
    print "URN: %s" % urn
    keyvaluepairs = {}

    def merge_in(d):
        "update keyvaluepairs with d; complain if anything is overwritten"
        for (k, v) in d.iteritems():
            if k in keyvaluepairs:
                assert keyvaluepairs[k] == v
            else:
                keyvaluepairs[k] = v

    merge_in(summary_scrape(urn))
    merge_in(page_scrape('general', urn))
    merge_in(page_scrape('school-characterisics', urn))
    merge_in(page_scrape('links', urn))
    merge_in(page_scrape('sen', urn))
    merge_in(page_scrape('pru', urn))
    merge_in(page_scrape('quality-indicators', urn))
    merge_in(page_scrape('communications', urn))
    merge_in(page_scrape('census-data', urn))
    merge_in(page_scrape('regional-indicators', urn))

    datastore.save(unique_keys=["URN"], data=keyvaluepairs)
    print
def main():

    page = html.parse(
        "http://www.manchester.gov.uk/schools/type/All/page/1/records/100000")

    for tr in page.findall("body/div/div/div/table/tr"):

        cols = tr.findall("td")
        if len(cols) != 4:
            continue
        (a, b, c, d) = cols

        data = {}

        l = a.find("p/a")
        data["School link"] = l.attrib["href"]
        data["Schoolname"] = l.text
        data["Address"] = " / ".join(
            (t.tail or "").strip() for t in a.findall("p/br"))

        data["Headteacher"] = b.text

        data["Phone number"] = c.find("p").text
        data["Fax number"] = c.find("p/strong").tail
        data["Email address"] = c.find("p/a").text

        for l in d.findall("a"):
            data[l.text] = l.attrib["href"]

        print data["Schoolname"]
        datastore.save(data=data, unique_keys=["Schoolname"])
def main():
    #scrape page
    borough_html = scraperwiki.scrape('http://maps.met.police.uk/php/dataview.php?area=MPS&ct=8')
    borough_page = BeautifulSoup.BeautifulSoup(borough_html)
    boroughs = extract_areas(borough_page)

    for borough in boroughs:
        ward_html = scraperwiki.scrape(borough['area_link'])
        ward_page = BeautifulSoup.BeautifulSoup(ward_html)
        wards = extract_areas(ward_page)
        for ward in wards:
            sub_ward_html = scraperwiki.scrape(ward['area_link'])
            sub_ward_page = BeautifulSoup.BeautifulSoup(sub_ward_html)
            sub_wards = extract_areas(sub_ward_page) 

            for sub_ward in sub_wards:
                crimes = extract_crime(sub_ward['area_link'])
                for crime in crimes:
                    
                    data = {
                        'borough' : borough['area_name'],
                        'ward' : ward['area_name'],
                        'sub_ward' : sub_ward['area_name'],
                        'super_output_area_code' : sub_ward['area_id'],                            
                        'month': crime['month'],
                        'crime_type': crime['crime_type'],
                        'crime_rate': crime['crime_rate'],
                        'crime_count': crime['crime_count'],                            
                        }

                    datastore.save(unique_keys=['super_output_area_code', 'month', 'crime_type'], data=data)
def process():
    for url,offset in sources:
        book = xlrd.open_workbook(file_contents=scrape(url))
        sheet = book.sheets()[0]
                         
        for row in range(0,sheet.nrows):
            for column in range(0,sheet.ncols):
                cell = sheet.cell(row,column)
                yearRange = getYearRange(cell)        
                if yearRange:                
                    rowCursor = row
                    while True:
                        rowCursor += 1
                        startIncome,endIncome = getIncomeRange(sheet.cell(rowCursor,column))
                        data = {
                                    'url'                : url,
                                    'incomeCoordinate'   : getCoordinate(rowCursor,column),
                                    'taxCoordinate'      : getCoordinate(rowCursor,column+offset),
                                    'yearRange'          : yearRange,
                                    'startIncome'        : startIncome,
                                    'endIncome'          : endIncome,
                                    'taxRate'            : sheet.cell(rowCursor,column+offset).value
                               }
                        if startIncome or endIncome:
                              print data
                              datastore.save(['url','incomeCoordinate','taxCoordinate'],data)
                        if startIncome and not endIncome:
                              break
def main():
    
    page = html.parse("http://www.manchester.gov.uk/schools/type/All/page/1/records/100000")

    for tr in page.findall("body/div/div/div/table/tr"):

        cols = tr.findall("td")
        if len(cols) != 4:
            continue
        (a,b,c,d) = cols

        data = {}

        l = a.find("p/a")
        data["School link"] = l.attrib["href"]
        data["Schoolname"] = l.text
        data["Address"] = " / ".join((t.tail or "").strip() for t in a.findall("p/br"))

        data["Headteacher"] = b.text

        data["Phone number"] = c.find("p").text
        data["Fax number"] = c.find("p/strong").tail
        data["Email address"] = c.find("p/a").text

        for l in d.findall("a"):
            data[l.text] = l.attrib["href"]

        print data["Schoolname"]
        datastore.save(data=data, unique_keys=["Schoolname"])
def parse_page(html, id):  # parse LA specific page
    la_page = BeautifulSoup(html.read())
    eo_det = la_page.find('div', {'class': 'yourOffice'})
    eo = {}
    eo['id'] = id
    address = [
        a.strip() for a in str(eo_det.find('p')).strip().split('<br />')
    ]
    address = address[1:-2]
    eo['address1'] = address[0]
    eo['address2'] = address[1]
    eo['address3'] = address[2]
    eo['address4'] = address[3]
    eo['postcode'] = address[4]
    try:
        eo['phone'] = address[5]
    except:
        pass
    # latlng = scraperwiki.geo.gb_postcode_to_latlng(eo['postcode']) # seems broke for now :[
    # print latlng
    h = eo_det.findAll('a')
    eo['local_authority'] = h[0].text
    eo['url'] = h[0]['href']
    if len(h) > 1:
        eo['email'] = re.match('^mailto:(.*)', h[1]['href']).groups()[0]
    # save
    datastore.save(unique_keys=['id'], data=eo)
    print eo
def schoolscrape(schoolname, schoolurl):

    schoolpage = BeautifulSoup(scrape(schoolurl))

    keyvalues = {}

    def addkeyvaluepair(k, v):
        print k + ": " + v
        keyvalues[k] = v

    addkeyvaluepair("schoolname", schoolname)

    # there's some extra data in the HTML comments which currently gets missed
    # (one way to recover it is sketched after this function)
    for label in schoolpage.findAll("div", {"class": "ecol1"}):
        attrib = tagcontents_to_string(label).rstrip(":")
        if attrib == "Address":
            field = label.findNextSibling("div", {"class": "ecol2"})
            while field.br:
                field.br.extract()
            lines = [str(x) for x in field.contents]
            postcode = postcode_format(str(lines[-1]).replace("&nbsp;", ""))
            addkeyvaluepair("Postcode", postcode)
            address = " / ".join([l.rstrip(", ") for l in lines[:-1]])
            addkeyvaluepair("Address", address)
        else:
            value = tagcontents_to_string(
                label.findNextSibling("div", {"class": "ecol2"}))
            addkeyvaluepair(attrib, value)

    print ""

    datastore.save(unique_keys=["schoolname"], data=keyvalues)
def parse_page(page):
    for table in page.findAll('table', {'id':'caselist'}):
        for row in table.findAll('tr')[1:]:
            if row['class'].find('last') < 0:
                cells = row.findAll('td')
                
                handed_down = cells[0].string
                neutral_citation = cells[1].string
                case_id = cells[2].string            
                case_name = cells[3].contents[1].string
                court = ''
                if len(cells[3].contents) == 5:
                    court = cells[3].contents[4]
                                    
                judgment_pdf_link = cells[4].findAll('a', title=Jr)[0]['href']
                press_summary_link = cells[4].findAll('a', title=PSr)[0]['href']
    
            #save to datastore
            data = {
                    'case_name' : case_name,
                    'source_of_appeal' : court,  
                    'handed_down' : handed_down,
                    'case_id' : case_id,
                    'neutral_citation' : neutral_citation,
                    'judgment_pdf_link' : judgment_pdf_link,
                    'press_summary_link' : press_summary_link
                    }
            datastore.save(unique_keys=['case_id'], data=data)
def parse_orgs(institution_list):

    ins = institution_list.findAll('tr', {
        'class': 'tHigh',
        'class': 'tLow',
    })
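    # note: a Python dict literal cannot hold the 'class' key twice, so the
    # filter above collapses to {'class': 'tLow'} and any 'tHigh' rows are skipped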

    cls_map = {
        'dc2': 'institution',
        'dc4': 'current_grants',
        'dc5': 'announced_grants_total',
    }
    # loop through all rows
    for i in ins:
        institution = {}
        link = i.find('a', {'class': 'noUndStd'})
        institution['stfc_url'] = base_url + link['href']
        institution['id'] = re.match('.*in=(-?\d+)$',
                                     institution['stfc_url']).group(1)
        print institution['id']

        for cell_cls, name in cls_map.iteritems():
            institution[name] = i.find('td', {'class': cell_cls}).text.strip()

        institution['announced_grants_total'] = int(
            institution['announced_grants_total'].replace(',', ''))
        datastore.save(unique_keys=['id'], data=institution)
        print institution
def scrape_constituency(seat, url):
    html = scraperwiki.scrape(url)
    page = BeautifulSoup.BeautifulSoup(html)
    # there's all sorts of stuff on this page.  I couldn't find
    # a value for the total electorate, although it might be here.
    # There is a turnout line, with a percentage value, from which
    # one could back-compute the electorate.  I don't do that yet
    # (a rough back-calculation is sketched after this function).
    table = page.find('table', attrs={'class': 'candidate-detail'})
    for candidate_row in table.tbody.findAll('tr'):
        print candidate_row
        items = candidate_row.findAll('td')
        party_class = candidate_row['class']
        # unlike the rest of the scrape, here we do hard-coded indexes.
        name = items[0].span.string.strip()
        party = items[1].string.strip()
        votes_string = items[2].string.replace(',', '')
        try:
            votes = int(votes_string)
        except:
            votes = None
        data = {
            'seat': seat,
            'candidate': name,
            'party': party,
            'votes': votes
        }
        datastore.save(unique_keys=['seat', 'candidate', 'party'], data=data)
    datastore.save(unique_keys=['seat'], data={'seat': seat, 'done': True})
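# A minimal sketch (not from the original scraper) of the electorate
# back-calculation mentioned in the comment above: given the turnout row's
# vote count and percentage, electorate ~= votes / (percent / 100).  How the
# turnout row is laid out on the page is an assumption, so the two string
# arguments here are hypothetical inputs.
def estimate_electorate(turnout_votes_string, turnout_percent_string):
    votes = int(turnout_votes_string.replace(',', ''))
    percent = float(turnout_percent_string.rstrip('%'))
    if percent == 0:
        return None
    # round to the nearest whole elector
    return int(round(votes * 100.0 / percent))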
def extract_table_data(pct_name, s, facility_type):
    """
    Extracts data from a list of PCT facilities
    """

    services = []
    d = {}
    for t in s.getchildren():
        if t.tag == "dt":
            if d != {}:
                services.append(d)
            d = {"PCT": pct_name, "type": "service"}
            u = t.find("a")
            if u != None:
                t = u
                d["info HTML"] = "http://www.nhs.uk" + t.attrib["href"]
            name = (t.text or "").strip()
            d["name"] = name
            print name
        elif t.text[:4] == "tel:":
            d["telephone"] = t.text[5:]
        else:
            address = t.text
            d["address"] = address
            postcode = geo.extract_gb_postcode(address)
            d["postcode"] = postcode
            d["latlng"] = geo.gb_postcode_to_latlng(postcode)

    for d in services:
        if "info HTML" in d:
            scrape_extra(d, facility_type)
        datastore.save(unique_keys=["PCT", "type", "name", "address"], data=d)
def main():
    #scrape page
    borough_html = scraperwiki.scrape(
        'http://maps.met.police.uk/php/dataview.php?area=MPS&ct=8')
    borough_page = BeautifulSoup.BeautifulSoup(borough_html)
    boroughs = extract_areas(borough_page)

    for borough in boroughs:
        ward_html = scraperwiki.scrape(borough['area_link'])
        ward_page = BeautifulSoup.BeautifulSoup(ward_html)
        wards = extract_areas(ward_page)
        for ward in wards:
            sub_ward_html = scraperwiki.scrape(ward['area_link'])
            sub_ward_page = BeautifulSoup.BeautifulSoup(sub_ward_html)
            sub_wards = extract_areas(sub_ward_page)

            for sub_ward in sub_wards:
                crimes = extract_crime(sub_ward['area_link'])
                for crime in crimes:

                    data = {
                        'borough': borough['area_name'],
                        'ward': ward['area_name'],
                        'sub_ward': sub_ward['area_name'],
                        'super_output_area_code': sub_ward['area_id'],
                        'month': crime['month'],
                        'crime_type': crime['crime_type'],
                        'crime_rate': crime['crime_rate'],
                        'crime_count': crime['crime_count'],
                    }

                    datastore.save(unique_keys=[
                        'super_output_area_code', 'month', 'crime_type'
                    ],
                                   data=data)
def parse_page(page):

    #find each row on this page
    for table in page.findAll('table', {'class': 't18Standard'}):
        for row in table.findAll('tr')[1:]:

            #strip out the details of each gift
            person_name = row.contents[0].string
            date_as_listed = row.contents[1].string
            detail_of_gift = row.contents[2].string
            donor_of_gift = row.contents[3].string

            #convert the date to a proper datetime object
            date_of_gift = datetime.strptime(date_as_listed, "%d-%b-%y")

            print "Found a gift for " + person_name
            data = {
                'person_name': person_name,
                'detail_of_gift': detail_of_gift,
                'donor_of_gift': donor_of_gift,
                'date_as_listed': date_as_listed
            }

            #save it to the datastore
            datastore.save(unique_keys=[
                'person_name', 'date_as_listed', 'detail_of_gift'
            ],
                           data=data,
                           date=date_of_gift)
def extract_table_data(pct_name,s,facility_type):
    """
    Extracts data from a list of PCT facilities
    """

    services = []
    d = {}
    for t in s.getchildren():
        if t.tag=="dt":
            if d != {}:
                services.append(d)
            d = {"PCT":pct_name, "type":"service"}
            u = t.find("a")
            if u != None:
                t = u
                d["info HTML"] = "http://www.nhs.uk" + t.attrib["href"]
            name = (t.text or "").strip()
            d["name"] = name
            print name
        elif t.text[:4]=="tel:":
            d["telephone"]=t.text[5:]
        else:
            address = t.text
            d["address"] = address
            postcode = geo.extract_gb_postcode(address)
            d["postcode"] = postcode
            d["latlng"] = geo.gb_postcode_to_latlng(postcode)
            
    for d in services:
        if "info HTML" in d:
            scrape_extra(d,facility_type)
        datastore.save(unique_keys=["PCT","type","name","address"], data=d)
def parse_page(page, url):
    for table in page.findAll('table', {'id': 'caselist'}):
        for row in table.findAll('tr')[1:]:
            if row['class'].find('last') < 0:
                cells = row.findAll('td')

                handed_down = cells[0].string
                neutral_citation = cells[1].string
                case_id = cells[2].string
                case_name = cells[3].contents[1].string
                court = ''

                # return absolute urls, they are WAY more useful.
                judgment_pdf_link = urlparse.urljoin(
                    url, cells[4].findAll('a', title=Jr)[0]['href'])
                press_summary_link = urlparse.urljoin(
                    url, cells[4].findAll('a', title=PSr)[0]['href'])

            #save to datastore
            data = {
                'case_name': case_name,
                'handed_down': handed_down,
                'case_id': case_id,
                'neutral_citation': neutral_citation,
                'judgment_pdf_link': judgment_pdf_link,
                'press_summary_link': press_summary_link
            }
            datastore.save(unique_keys=['case_id'], data=data)
def schoolscrape(serial,name):
    url = "http://www.leics.gov.uk/index/education/going_to_school/information_about_schools/schools_resultdetail.htm?DFES=" + serial + "&submit=Search"
    subpage = BeautifulSoup(scrape(url))
    print name

    keyvalues = {}

    def addkeyvaluepair(k,v):
        print k + ": " + v
        keyvalues[k] = v    
    
    addkeyvaluepair("schoolname",name)

    for t in subpage.findAll("td",headers=re.compile("..*")):
        attrib = t.get("headers")
        if attrib[:7] == "school_":
            attrib = attrib[7:]
        if attrib == "address":
            pc = postcode_format(str(t.contents[-1]))
            addkeyvaluepair("postcode",pc)
            t.contents = t.contents [:-2]
        addkeyvaluepair(attrib,tagcontents_to_string(t))
    print ""

    datastore.save(unique_keys=["schoolname"], data=keyvalues)
def details(extra, data):
    address = re.findall('(?si)<!-- BLOCK: PostalAddress -->\s*<strong>Write to me at:</strong><br />(.*?)<br /><br />\s*<!-- ENDBLOCK: PostalAddress -->', extra)
    if address:
        address = re.sub('\r|,', '', address[0])
        data["address"] = re.sub('\n', ' ', address)
    phone = re.findall('(?si)<!-- BLOCK: Telephone -->\s*<strong>Phone me on:</strong><br />(.*?)<br /><br />\s*<!-- ENDBLOCK: Telephone -->', extra)
    if phone:
        data["phone"] = phone[0]
    email = re.findall('(?si)<!-- BLOCK: EmailAddress -->\s*<strong>Email me at:</strong><br /><a href="mailto:(.*?)">.*?</a><br /><br />\s*<!-- ENDBLOCK: EmailAddress -->', extra)
    if email:
        data["email"] = email[0]
    website = re.findall('(?si)<!-- BLOCK: WebsiteAddress -->\s*<strong>Website address:</strong><br /><a href="(.*?)".*?>.*?</a><br /><br />\s*<!-- ENDBLOCK: WebsiteAddress -->', extra)
    if website:
        data["website"] = website[0]
    bio = re.findall('(?si)<!-- BLOCK: Biography -->\s*<div class="content_pod_content_title"><h1>.*?</h1></div>(.*?)<!-- ENDBLOCK: Biography -->', extra)
    if bio:
        data["bio"] = SimplifyHTML(bio[0])
        if re.search("Rory Palmer", data["bio"]):  # very bad formatting here
            data["bio"] = re.sub("(?s)^Rory Palmer.*?About Rory Palmer", "", data["bio"])
            data["bio"] = re.sub("==", "", data["bio"])
            data["bio"] = re.sub("\s*?\n\n\s*?", "\n\n", data["bio"]).strip()
        data["bio"] = re.sub("^Biographical Details\n\s*", "", data["bio"])
    photo = re.findall('(?si)<td valign="top" width="210"><img src="(.*?)" border="0" alt=".*?" width="200" class="" />', extra)
    if photo:
        data["photo"] = urlparse.urljoin(data["url"], photo[0])
    constituency = re.findall('(?si)</h6>\s*PPC for (.*?)<br />', extra)
    if constituency:
        data["constituency"] = RegularizeConstituency(constituency[0])
        if data["constituency"]:
            datastore.save(unique_keys=['name', 'constituency'], data=data)
        else:
            assert data["name"] == "Adam Leeder"
    else:
        print "No constituency found", data, extra
    print data
def parse_page(html, id): # parse LA specific page
    la_page = BeautifulSoup(html.read())
    eo_det = la_page.find('div',{'class':'yourOffice'})
    eo={}
    eo['id'] = id
    address = [a.strip() for a in str(eo_det.find('p')).strip().split('<br />')]
    address = address[1:-2]
    eo['address1'] = address[0]
    eo['address2'] = address[1]
    eo['address3'] = address[2]
    eo['address4'] = address[3]
    eo['postcode'] = address[4]
    try:
        eo['phone'] = address[5]
    except:
        pass
    # latlng = scraperwiki.geo.gb_postcode_to_latlng(eo['postcode']) # seems broke for now :[
    # print latlng
    h = eo_det.findAll('a')
    eo['local_authority'] = h[0].text
    eo['url'] = h[0]['href']
    if len(h) > 1:
        eo['email'] = re.match('^mailto:(.*)',h[1]['href']).groups()[0]
    # save
    datastore.save(unique_keys=['id'], data=eo)
    print eo
def do_year(y, url):
    pagetext = urllib2.urlopen(url)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"),
                        tokenizer=sanitizer.HTMLSanitizer)
    page = parser.parse(pagetext)

    for section in page.findall(
            "body/div/div/div/div/div/div/div/div/table[@class='fixture']"):

        matchtype = section.find("caption").text

        for match in section.findall("tbody/tr"):

            l = list(match.getchildren())
            d = {}
            d["Match type"] = matchtype
            d["Match number"] = l[0].text
            d["Date"] = make_date(l[1].text, y)
            d["Team 1"] = flatten_refs(l[3])
            d["Team 2"] = flatten_refs(l[5])
            a = l[4].find("a")
            d["Score"] = a.text
            d["Report"] = "http://www.fifa.com" + a.get("href")
            print "%d (%s) %s - %s" % (y, d["Match type"], d["Team 1"],
                                       d["Team 2"])
            datastore.save(unique_keys=["Date", "Team 1", "Team 2"], data=d)
def main():

    url = "http://www.visitukheritage.gov.uk/servlet/com.eds.ir.cto.servlet.CtoLandDbQueryServlet?region=0&colflag=N"
    lines = iter(urlopen(url))

    postcode = re.compile("^\s*([A-Z][A-Z]?[0-9][0-9]?[A-Z]?)\s*([0-9][ABD-HJLNP-UW-Z][ABD-HJLNP-UW-Z])\s*$")
    
    keyvaluepairs = {}

    for l in lines:

        for l in lines:
            if re.search("<TR align=\"left\" Valign='top'>",l):
                keyvaluepairs = {}
                break
        else:
            break # Don't loop through if there's no more records

        # link and serial number
        l = lines.next()
        m = re.search("<A HREF='(.*)'>",l)
        link = "http://www.visitukheritage.gov.uk" + m.groups()[0]
        keyvaluepairs["Link"] = link
        m = re.search("<B>([0-9]*)</B>",l)
        serial = m.groups()[0]
        keyvaluepairs["Serial"] = serial
        print serial

        # location
        for l in lines:
            m = re.search("<TD>(.*)</TD>",l)
            if m:
                keyvaluepairs["Location"] = m.groups()[0]
                break

        # separate page
        datapage = "".join(urlopen(link)).replace("\n","")
        for m in re.finditer("<font face=\"Arial, Helvetica, sans-serif\" size=\"-1\">([^<]*)</font></b></td><td [^>]*align=\"left\">([^<]*)</td",datapage):
            k = m.groups()[0].strip().strip(":")
            v = m.groups()[1].replace("<br>","\n").strip()
            if v != "":
                keyvaluepairs[k] = v
        
        ### doesn't get links (one way to capture them is sketched after this function)

        # tidy up the address
        if "Contact Address" in keyvaluepairs:
            raw_address = [x.strip() for x in keyvaluepairs["Contact Address"].split(",")]
            # separate off a phone number
            if len(raw_address)>0 and re.match("[ 0-9]*",raw_address[-1]):
                keyvaluepairs["Contact Telephone Number"] = raw_address[-1]
                raw_address = raw_address[:-1]
            if len(raw_address)>0 and re.match(postcode,raw_address[-1]):
                keyvaluepairs["Contact Postcode"] = raw_address[-1]
                raw_address = raw_address[:-1]
            keyvaluepairs["Contact Address"] = ", ".join(raw_address)                
                
        # now save it
        datastore.save(unique_keys=["Serial"],data=keyvaluepairs)
def scrape_pct(link, pct_name):
    """
    Scrapes the data associated with the PCT, and calls functions to scrape
    data associated with the services.
    """

    print
    print
    print pct_name
    print "-" * len(pct_name)

    url = "http://www.nhs.uk" + link
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))
    root = page.getroot()

    d = {}

    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    address = root.find("body/div/form/div/div/p").text
    d["address"] = address
    postcode = geo.extract_gb_postcode(address)
    d["postcode"] = postcode
    d["latlng"] = geo.gb_postcode_to_latlng(postcode)
    d["info HTML"] = url

    # quality
    for t in root.findall(
            "body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"
    ):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v

    # head honcho
    for t in root.findall(
            "body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"
    ):
        d["Boss"] = t.text.replace("<br />", ", ")

    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class", False) == "intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = d.get("boilerplate", "") + "\n" + t.text

    datastore.save(unique_keys=["PCT", "type", "name", "address"],
                   data=d,
                   latlng=d.get("latlng"))

    scrape_facilities(pct_name, root)
    scrape_others(pct_name, url)
def schoolscrape(categoryurl, name, url):

    print ""
    print name

    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(specialscrape(url))

    # pre = "{http://www.w3.org/1999/xhtml}"
    pre = ""

    keyvaluepairs = {}

    def addkeyvaluepair(k, v):
        keyvaluepairs[k] = v
        print k + ": " + v

    data_rows = [
        t
        for t in page.findall(path(["body", "div", "div", "div", "div"], pre))
        if t.attrib.get("class", "") == "detailsRow"
    ]

    for row in data_rows:
        key = [
            t for t in row.findall(path(["span"], pre))
            if t.attrib.get("class", "") == "leftColumn"
        ][0].text.rstrip(": ")
        valuetag = [
            t for t in row.findall(path(["span"], pre))
            if t.attrib.get("class", "") == "rightColumn"
        ][0]
        if valuetag.text:
            if key == "Address":
                raw_address = [valuetag.text] + [
                    br.tail for br in valuetag.findall(path(["br"], pre))
                ]
                addkeyvaluepair("Address", " / ".join(raw_address[:-1]))
                addkeyvaluepair("Postcode", raw_address[-1])
            else:
                addkeyvaluepair(key, valuetag.text)
        else:
            links = valuetag.findall(path(["a"], pre))
            if len(links) == 1:
                addkeyvaluepair(key, links[0].attrib["href"])
            else:
                for link in links:
                    href = link.attrib["href"]
                    if href[:7] != "http://":
                        href = categoryurl + "details/" + href
                    addkeyvaluepair(link.text, href)

    datastore.save(unique_keys=["Name"], data=keyvaluepairs)
def scrapeschool(url):
    page = BeautifulSoup(scrape(url))
    schoolname = str(page.find("h2").contents[0])
    print ""
    print schoolname

    keyvalues = {}

    def addkeyvaluepair(k,v):
        print k + ": " + v
        keyvalues[sanitise(k)] = v
    
    def sanitise(s):
        return s.replace("(","").replace(")","").replace("'","").replace(">","")

    addkeyvaluepair("Schoolname",schoolname)
    
    # Some general key/value pairs
    for heading in page.findAll("th",style="width:30%;text-align:left;"):
        data = heading.findNextSibling("td",style="width:70%;text-align:left;")
        addkeyvaluepair(str(heading.contents[0]).rstrip(":"),str("".join([str(x) for x in data.contents])))

    # Some other general key/value pairs
    for tablebit in page.findAll("td", {"class":"tbltext", "style":"width:40%;text-align:left;"}):
        while tablebit.br:
            tablebit.br.extract()
        for heading in tablebit.findAll("strong"):
            body = heading.nextSibling
            try:
                body = body.get("href")
            except AttributeError:
                pass
            addkeyvaluepair(str(heading.contents[0]).rstrip(": "),str(body).rstrip(" \n\r"))
            
    # Address and postcode
    for addressbit in page.findAll("td", {"style":"width:60%;vertical-align:top;", "class":"tbltext"}):
        for link in addressbit.findAll("a"):
            addkeyvaluepair(link.contents[0],link.get("href"))
        text = [str(x).rstrip("\r\n ,").replace("&nbsp;","") for x in addressbit.contents if isinstance(x,NavigableString)]
        fulladdresstext = [x for x in text if x != ""]
        addkeyvaluepair("Address"," / ".join(fulladdresstext[:-1]))
        addkeyvaluepair("Postcode",fulladdresstext[-1])

    # School dinner menu link
    for arrow in page.findAll("img",{"src":"arrow.gif","width":"5","height":"5","alt":" "}):
        link = arrow.findNextSibling("a")
        addkeyvaluepair(link.contents[0],"http://www.nottinghamshire.gov.uk/" + link.get("href"))

    # Linked schools
    for linkedschools in page.findAll("td",{"style":"width:70%;text-align:left;vertical-align:top;"}):
        addkeyvaluepair("Linked Schools","; ".join([link.contents[0] for link in linkedschools.findAll("a")]))

    datastore.save(unique_keys=["Schoolname"], data=keyvalues)
def details(extra, data):
    address = re.findall(
        '(?si)<!-- BLOCK: PostalAddress -->\s*<strong>Write to me at:</strong><br />(.*?)<br /><br />\s*<!-- ENDBLOCK: PostalAddress -->',
        extra)
    if address:
        address = re.sub('\r|,', '', address[0])
        data["address"] = re.sub('\n', ' ', address)
    phone = re.findall(
        '(?si)<!-- BLOCK: Telephone -->\s*<strong>Phone me on:</strong><br />(.*?)<br /><br />\s*<!-- ENDBLOCK: Telephone -->',
        extra)
    if phone:
        data["phone"] = phone[0]
    email = re.findall(
        '(?si)<!-- BLOCK: EmailAddress -->\s*<strong>Email me at:</strong><br /><a href="mailto:(.*?)">.*?</a><br /><br />\s*<!-- ENDBLOCK: EmailAddress -->',
        extra)
    if email:
        data["email"] = email[0]
    website = re.findall(
        '(?si)<!-- BLOCK: WebsiteAddress -->\s*<strong>Website address:</strong><br /><a href="(.*?)".*?>.*?</a><br /><br />\s*<!-- ENDBLOCK: WebsiteAddress -->',
        extra)
    if website:
        data["website"] = website[0]
    bio = re.findall(
        '(?si)<!-- BLOCK: Biography -->\s*<div class="content_pod_content_title"><h1>.*?</h1></div>(.*?)<!-- ENDBLOCK: Biography -->',
        extra)
    if bio:
        data["bio"] = SimplifyHTML(bio[0])
        if re.search("Rory Palmer", data["bio"]):  # very bad formatting here
            data["bio"] = re.sub("(?s)^Rory Palmer.*?About Rory Palmer", "",
                                 data["bio"])
            data["bio"] = re.sub("==", "", data["bio"])
            data["bio"] = re.sub("\s*?\n\n\s*?", "\n\n", data["bio"]).strip()
        data["bio"] = re.sub("^Biographical Details\n\s*", "", data["bio"])
    photo = re.findall(
        '(?si)<td valign="top" width="210"><img src="(.*?)" border="0" alt=".*?" width="200" class="" />',
        extra)
    if photo:
        data["photo"] = urlparse.urljoin(data["url"], photo[0])
    # for MPs
    mconstituency = re.search(
        '(?si)</h6>\s*(?:MP for (.*?)<br />)?\s*(?:PPC for (.*?)<br />)?',
        extra)
    if not mconstituency:
        print "---", extra
        return
    if mconstituency.group(1):
        data["MP for"] = mconstituency.group(1)
    if mconstituency.group(2):
        data["constituency"] = RegularizeConstituency(mconstituency.group(2))
        datastore.save(unique_keys=['constituency'], data=data)
    else:
        print "MPonly  ", data
def scrape_school(lea_name, lea_number, urlfrag):
    data = {"LEA name": lea_name, "LEA number": lea_number}

    url = "http://www.education.gov.uk" + urlfrag
    page = html.parse(url)

    # school name
    headerpath = "/".join(["body", "div", "div", "h1"])
    name = page.find(headerpath).text
    print " * %s" % name
    data["School name"] = name

    # contact data, etc
    attribpath = "/".join(["body", "div", "div", "div", "div", "dl"])
    for attriblist in page.findall(attribpath):
        for (title, entries) in description(attriblist):
            titletext = title.text.rstrip(":")
            if titletext[-24:] == " (click for explanation)":
                titletext = titletext[:-24]

            entrytexts = []
            for entry in entries:
                link = entry.find("a")
                if (link is not None) and (link.attrib.get("class", "")
                                           == "acronym") and ("title"
                                                              in link.attrib):
                    entrytexts.append(link.attrib["title"])
                else:
                    entrytexts.append(
                        unmarkup(entry).strip(" \n").replace("\n", "; "))
            entrytext = ", ".join(entrytexts)

            data[titletext] = entrytext
            if report:
                print "    - %s: %s" % (titletext, entrytext)

    # main data
    listpath = "/".join(["body", "div", "div", "div", "div", "div", "dl"])
    for datalist in page.findall(listpath):
        if "class" in datalist.attrib and datalist.attrib[
                "class"] == "schoolsstatslist":
            for (title, entry) in zip(datalist.findall("dt"),
                                      datalist.findall("dd")):
                titletext = title.text.strip()

                entrytext = unmarkup(entry).strip()

                data[titletext] = entrytext
                if report:
                    print "    - %s: %s" % (titletext, entrytext)

    datastore.save(data=data, unique_keys=["LEA name", "School name"])
def scrape_candidate_details(href):
    """Gets the details about each candidate"""
    data = {}
    html = scraperwiki.scrape(base_url % href.replace("amp;", ""))
    page = BeautifulSoup.BeautifulSoup(html)
    #The heading contains the name
    heading = page.find('div', {'id': 'divHeading'})
    data['name'] = heading.text.split(' &ndash;')[0]
    constituency = page.find('div', {
        'id': 'divConstituencyContactInfo'
    }).findAll('a')
    try:
        data['constituency'] = constituency[0].text
    except IndexError:
        constituency = page.find('div', {'id': 'divIntroduction'}).findAll('a')
        to_save = ""
        for link in constituency:
            if link.text != "":
                to_save = link.text
        data['constituency'] = to_save
    #Each candidate has an AboutMe section.
    about = page.find('div', {'id': 'divAboutMe'})
    for table in about.findAll('table'):
        for row in table.findAll('tr'):
            data[row.find('th').text.replace(':', '')] = row.find('td').text
    #Extracts the candidate's bio
    bio = page.find('div', {'id': 'divBiography'})
    bio_text = []
    for para in bio.findAll('p'):
        bio_text.append(para.text)
    data['bio'] = "\n".join(bio_text)
    #Get the contact info for each candidate
    contact = page.find('div', {'id': 'divIndividualContactInfo'})
    for address in contact.findAll('ul', 'address'):
        to_store = []
        for line in address.findAll('li'):
            to_store.append(line.text)
        data['address'] = ', '.join(to_store)
    links = contact.findAll('a')
    if len(links) > 0:
        if len(links) == 2:
            data['email'] = links[0]['href'].replace('mailto:', '')
            data['website'] = links[1]['href']
        else:
            data['email'] = links[0]['href'].replace('mailto:', '')
    #Use re to get telephone number
    m = re.search("<strong>Telephone:</strong>(.*)<br /><strong>",
                  str(contact))
    if m is not None:
        data['telephone'] = m.group(1)
    datastore.save(unique_keys=['constituency'], data=data)
def scrape_pct(link,pct_name):
    """
    Scrapes the data associated with the PCT, and calls functions to scrape
    data associated with the services.
    """
    
    print
    print
    print pct_name
    print "-"*len(pct_name)

    url = "http://www.nhs.uk" + link
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))
    root = page.getroot()

    d = {}

    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    address = root.find("body/div/form/div/div/p").text
    d["address"] = address
    postcode = geo.extract_gb_postcode(address)
    d["postcode"] = postcode
    d["latlng"] = geo.gb_postcode_to_latlng(postcode)
    d["info HTML"] = url

    # quality
    for t in root.findall("body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v

    # head honcho
    for t in root.findall("body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"):
        d["Boss"] = t.text.replace("<br />",", ")

    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class",False)=="intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = d.get("boilerplate","")+"\n"+t.text

    datastore.save(unique_keys=["PCT","type","name","address"], data=d, latlng=d.get("latlng"))

    scrape_facilities(pct_name,root)
    scrape_others(pct_name,url)
def ScrapeWard(ward_id, year):
    url_format_string = "http://breathingspace.sefton.gov.uk/Default.aspx?bsPage=road_safety&option=4&step=2&WardId={0}&StartMonth=1&StartYear={1}&EndMonth=12&EndYear={1}"
    url = str.format(url_format_string, ward_id, year)
    html = scraperwiki.scrape(url)
    page = BeautifulSoup.BeautifulSoup(html)
    table = page.findAll('table', {'class': 'Grid'})[1]
    for row in table.findAll('tr')[1:]:
        cells = row.findAll('td')
        time = ExtractTime(cells[0].string, cells[1].string)
        location_description = cells[2].string
        latlng = ConvertLocationToLatLng(cells[3].string)
        details_url = 'http://breathingspace.sefton.gov.uk/' + cells[4].find('a')['href']
        data = {"date": time, "location_description": location_description, "url": details_url }
        data.update(ScrapeAccidentDetails(details_url))        
        datastore.save(unique_keys = ['date', 'location_description'], latlng = latlng, data = data)
def extractMonthlyData(d):
    print "Date: " + d
    
    url = "http://www.tax.state.ak.us/programs/oil/production/ans.aspx?" + d

    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(urlopen(url))
        
    for r in page.findall("body/form/div/div/div/div/table/tbody/tr"):
        l = list(c.text for c in r.findall("td"))
        d = processDate(l[0])
        if d:
            l[0] = d
            data = dict(zip(fields,l))
            datastore.save(unique_keys=["Date"], data=data)
def extractMonthlyData(d):
    print "Date: " + d

    url = "http://www.tax.state.ak.us/programs/oil/production/ans.aspx?" + d

    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(urlopen(url))

    for r in page.findall("body/form/div/div/div/div/table/tbody/tr"):
        l = list(c.text for c in r.findall("td"))
        d = processDate(l[0])
        if d:
            l[0] = d
            data = dict(zip(fields, l))
            datastore.save(unique_keys=["Date"], data=data)
def scrape_candidate_details(href):
    """Gets the details about each candidate"""
    data = {}
    html = scraperwiki.scrape(base_url % href.replace("amp;",""))
    page = BeautifulSoup.BeautifulSoup(html)
    #The heading contains the name
    heading = page.find('div', {'id': 'divHeading'})
    data['name'] = heading.text.split(' &ndash;')[0]
    constituency = page.find('div', {'id': 'divConstituencyContactInfo'}).findAll('a')
    try:
        data['constituency'] = constituency[0].text
    except IndexError:
        constituency = page.find('div', {'id': 'divIntroduction'}).findAll('a')
        to_save = ""
        for link in constituency:
            if link.text != "":
                to_save = link.text
        data['constituency'] = to_save
    #Each candidate has an AboutMe section.
    about = page.find('div', {'id': 'divAboutMe'})
    for table in about.findAll('table'):
        for row in table.findAll('tr'):
            data[row.find('th').text.replace(':', '')] = row.find('td').text
    #Extracts the candidate's bio
    bio = page.find('div', {'id':'divBiography'})
    bio_text=[]
    for para in bio.findAll('p'):
        bio_text.append(para.text)
    data['bio'] = "\n".join(bio_text)
    #Get the contact info for each candidate
    contact = page.find('div', {'id':'divIndividualContactInfo'})
    for address in contact.findAll('ul', 'address'):
        to_store = []
        for line in address.findAll('li'):
            to_store.append(line.text)
        data['address'] = ', '.join(to_store)
    links = contact.findAll('a')
    if len(links) > 0:
        if len(links) == 2:
            data['email'] = links[0]['href'].replace('mailto:', '')
            data['website'] = links[1]['href']
        else:
            data['email'] = links[0]['href'].replace('mailto:', '')
    #Use re to get telephone number        
    m = re.search("<strong>Telephone:</strong>(.*)<br /><strong>", str(contact))   
    if m is not None:
        data['telephone'] = m.group(1) 
    datastore.save (unique_keys = ['constituency'], data = data)      
def parse_page(page):


    wrapper = page.find('div', {'id': 'print_div1'})
    for row in wrapper.findAll('tr')[1:]:
        cells = row.findAll('td')
        title = cells[0].contents[0].string
        country = cells[1].string
        funding_type = cells[2].string
        stage = cells[3].string
        start_date = datetime.strptime(cells[4].string, "%d/%m/%Y")
        total_budget = cells[5].string.replace(',', '')
    
    
        data = { 'title' : title, 'country' : country, 'funding_type': funding_type, 'stage': stage, 'total_budget': total_budget, 'start_date': start_date}
        datastore.save(unique_keys=['title','country', 'total_budget', 'start_date'], data=data, date=start_date)
def scrape_school(lea_name, lea_number, urlfrag):
    data = {"LEA name":lea_name, "LEA number":lea_number}
    
    url = "http://www.education.gov.uk" + urlfrag
    page = html.parse(url)

    # school name
    headerpath = "/".join(["body","div","div","h1"])
    name = page.find(headerpath).text
    print " * %s"%name
    data["School name"]=name

    # contact data, etc
    attribpath = "/".join(["body","div","div","div","div","dl"])
    for attriblist in page.findall(attribpath):
        for (title,entries) in description(attriblist):
            titletext = title.text.rstrip(":")  
            if titletext[-24:] == " (click for explanation)":
                titletext = titletext[:-24]

            entrytexts = []
            for entry in entries:
                link = entry.find("a")
                if (link is not None) and (link.attrib.get("class","") == "acronym") and ("title" in link.attrib):
                    entrytexts.append(link.attrib["title"])
                else:
                    entrytexts.append(unmarkup(entry).strip(" \n").replace("\n","; "))
            entrytext = ", ".join(entrytexts)

            data[titletext] = entrytext
            if report:
                print "    - %s: %s"%(titletext,entrytext)
            
    # main data
    listpath = "/".join(["body","div","div","div","div","div","dl"])
    for datalist in page.findall(listpath):
        if "class" in datalist.attrib and datalist.attrib["class"] == "schoolsstatslist":
            for (title,entry) in zip(datalist.findall("dt"),datalist.findall("dd")):
                titletext = title.text.strip()

                entrytext = unmarkup(entry).strip()

                data[titletext] = entrytext
                if report:
                    print "    - %s: %s"%(titletext,entrytext)

    datastore.save(data=data, unique_keys=["LEA name","School name"])
def scrapepage(url):
    html = scraperwiki.scrape(url)
    page = fromstring(html)
    print page
    datevalue = CSSSelector('select#period option[selected]')(page)[0]['value']
    print datevalue
    
    for row in CSSSelector('table.datagrid tbody tr')(page):
        columns = CSSSelector('td')(row)
        data = {'channel': columns[0].text,
            'dailyreach': cleanint(columns[1].text) * 1000,
            'dailyreach_percent': cleanfloat(columns[2].text),
            'weeklyreach': cleanint(columns[3].text) * 1000,
            'weeklyreach_percent': cleanfloat(columns[4].text),
            'weeklyviewing': cleantime(columns[5].text),
            'share': cleanfloat(columns[6].text)}
        datastore.save(unique_keys=['channel'], data=data)
def convertDate(value):
    m = reDM.match(value)
    if m:
        return '2010-%02d-%02d' % (convertMonth(m.group(2)), int(m.group(1)))
    else:
        return value


def Main():
    url = "http://www.london2012.com/games/olympic-sports/"
    br = mechanize.Browser()
    br.set_handle_robots(False)
    base = br.open(url)
    page = base.read()
    area = re.findall(
        '(?si)<span class="selected">Olympic sports</span><ul>(.*?)</ul>',
        page)
    events = re.findall('(?si)<li>(.*?)</li>', area[0])
    for event in events:
        data = {}
        sport = re.findall('(?si)<a href=".*?">(.*?)\s\-\s.*?</a>', event)
        if sport:
            data["sport"] = sport[0]
        else:
            sport = re.findall('(?si)<a href=".*?">(.*?)</a>', event)
            if sport:
                sport = sport[0].replace("Canoe Slalom", "Canoe").replace(
                    "Canoe Sprint", "Canoe")
                data["sport"] = sport
        category = re.findall('(?si)<a href=".*?">\w*?\s\-\s(.*?)</a>', event)
        if category:
            data["category"] = category[0]
        else:
            category = re.findall('(?si)<a href=".*?">Canoe (.*?)</a>', event)
            if category:
                data["category"] = category[0]
        link = re.findall('(?si)<a href="(.*?)">.*?</a>', event)
        details = br.follow_link(url_regex=link[0])
        getDetails(details.read(), data)
        br.back()
        link = urlparse.urljoin("http://www.london2012.com/", link[0])
        data["link"] = link
        datastore.save(unique_keys=['sport', 'link'], data=data)
        print data
        print "--------------------------------------------------------------------"
def parse_orgs(institution_list):
    
    ins = institution_list.findAll('tr', {'class':'tHigh', 'class':'tLow', })
    
    cls_map = {'dc2':'institution', 'dc4':'current_grants', 'dc5':'announced_grants_total', }
    # loop through all rows
    for i in ins: 
        institution = {}
        link = i.find('a', {'class':'noUndStd'})
        institution['stfc_url'] = base_url + link['href']
        institution['id'] = re.match('.*in=(-?\d+)$',institution['stfc_url']).group(1)
        print institution['id'] 
        
        for cell_cls, name in cls_map.iteritems():
            institution[name] = i.find('td', {'class':cell_cls}).text.strip()
            
        institution['announced_grants_total']  = int(institution['announced_grants_total'].replace(',',''))
        datastore.save(unique_keys=['id'], data=institution)
        print institution
def Main():
    url = "http://www.snp.org/people/candidates/Westminster"

    br = mechanize.Browser()
    br.set_handle_robots(False)
    base = br.open(url)
    page = base.read()
    #print page
    candidates = re.findall(
        '(?si)<div class=\'view-content view-content-people-candidates\'><div class="item-list"><ul>(.*?)</ul></div></div>',
        page)
    links = re.findall('(?si)<li>(.*?)</li>', candidates[0])
    for i, link in enumerate(links):
        data = {}
        constituency = re.findall('(?si)<a href=".*?">(.*?):.*?</a>', link)
        data["constituency"] = RegularizeConstituency(constituency[0])
        name = re.findall('(?si)<a href=".*?">.*?:\s*(.*?)</a>', link)
        data["name"] = name[0]
        ppc_link = re.findall('(?si)<a href="(.*?)">.*?:\s*.*?</a>', link)
        llink = ppc_link[0]
        if llink == "//stewarthosie":
            llink = "/stewarthosie"
        data["url"] = urlparse.urljoin(url, llink)
        black_list = [
            "/people/midlothian-colin-beattie",
            "/people/moray-angus-robertson",
            "/people/motherwell-wishaw-marion-fellows",
            "/people/ochil-south-perthshire",
            "/people/orkney-shetland-john-mowat"
        ]
        print i, data["url"]
        if ppc_link[0] not in black_list:
            #extra = br.follow_link(url_regex=ppc_link[0])
            try:
                extra = urllib2.urlopen(data["url"])
                Details(extra.read(), data)
            except urllib2.HTTPError as e:
                print e

            #br.back()
        #print "DATA: ", data
        datastore.save(unique_keys=['constituency'], data=data)
def main():
    w = xlrd.open_workbook(file_contents=scrape("http://uk.sitestat.com/lincolnshire/lincolnshire/s?Home.A_Parent.School_Admissions.All_About_Your_Local_Schools.A__Z_List_of_Schools.AZ_List_of_Schools.xls&ns_type=pdf&ns_url=http://www.lincolnshire.gov.uk/upload/public/attachments/1172/AZ_List_of_Schools.xls"))
    s = w.sheet_by_index(0)

    keys = [str(c.value) for c in s.row(0)]
    schoolname = keys[1]

    for i in range(1,s.nrows):
        r = s.row(i)
        if sum([len(c.value) for c in r[1:]]) == 0:
            # want to test that all the rows are empty, but the tests don't work
            # (a more robust check is sketched after this function)
            # this is just an extra heading row; we don't need it
            pass
        else:
            keyvalues = {}
            for (k,c) in zip(keys,r):
                v = str(c.value.replace(u'\u2019',"'"))
                if v != "":
                    keyvalues[k] = v
            datastore.save(unique_keys=[schoolname],data=keyvalues)
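# A minimal sketch (an assumption, not from the original scraper) of the
# "all cells empty" test the comment above wanted: xlrd cells can hold
# numbers as well as text, so coerce each value to unicode before checking.
def row_is_blank(cells):
    return all(unicode(c.value).strip() == "" for c in cells)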
def schoolscrape(categoryurl,name,url):

    print ""
    print name

    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(specialscrape(url))
    
    # pre = "{http://www.w3.org/1999/xhtml}"
    pre = ""
    
    keyvaluepairs = {}

    def addkeyvaluepair(k,v):
        keyvaluepairs[k] = v
        print k + ": " + v

    data_rows = [t for t in page.findall(path(["body","div","div","div","div"],pre)) if t.attrib.get("class","") == "detailsRow"]

    for row in data_rows:
        key = [t for t in row.findall(path(["span"],pre)) if t.attrib.get("class","") == "leftColumn"][0].text.rstrip(": ")
        valuetag = [t for t in row.findall(path(["span"],pre)) if t.attrib.get("class","") == "rightColumn"][0]
        if valuetag.text:
            if key == "Address":
                raw_address = [valuetag.text] + [br.tail for br in valuetag.findall(path(["br"],pre))]
                addkeyvaluepair("Address"," / ".join(raw_address[:-1]))
                addkeyvaluepair("Postcode",raw_address[-1])
            else:
                addkeyvaluepair(key,valuetag.text)
        else:
            links = valuetag.findall(path(["a"],pre))
            if len(links) == 1:
                addkeyvaluepair(key,links[0].attrib["href"])
            else:
                for link in links:
                    href = link.attrib["href"]
                    if href[:7] != "http://":
                        href = categoryurl + "details/" + href
                    addkeyvaluepair(link.text,href)
                    
    datastore.save(unique_keys=["Name"], data=keyvaluepairs)
def convertDate(value):
    m = reDM.match(value)
    if m:
        return '2010-%02d-%02d' % (convertMonth(m.group(2)), int(m.group(1)))
    else:
        return value

def Main():
    url = "http://www.london2012.com/games/olympic-sports/"
    br = mechanize.Browser()
    br.set_handle_robots(False)
    base = br.open(url)
    page = base.read()
    area = re.findall('(?si)<span class="selected">Olympic sports</span><ul>(.*?)</ul>', page)
    events = re.findall('(?si)<li>(.*?)</li>', area[0])
    for event in events:
        data = {}
        sport = re.findall('(?si)<a href=".*?">(.*?)\s\-\s.*?</a>', event)
        if sport:
            data["sport"] = sport[0]
        else:
            sport = re.findall('(?si)<a href=".*?">(.*?)</a>', event)
            if sport:
                sport = sport[0].replace("Canoe Slalom", "Canoe").replace("Canoe Sprint", "Canoe")
                data["sport"] = sport
        category = re.findall('(?si)<a href=".*?">\w*?\s\-\s(.*?)</a>', event)
        if category:
            data["category"] = category[0]
        else:
            category = re.findall('(?si)<a href=".*?">Canoe (.*?)</a>', event)
            if category:
                data["category"] = category[0]
        link = re.findall('(?si)<a href="(.*?)">.*?</a>', event)
        details = br.follow_link(url_regex=link[0])
        getDetails(details.read(), data)
        br.back()
        link = urlparse.urljoin("http://www.london2012.com/", link[0])
        data["link"] = link
        datastore.save(unique_keys=['sport', 'link'], data=data)
        print data
        print "--------------------------------------------------------------------"
def parse_page(page):        

    #find each row on this page
    for table in page.findAll('table', {'class': 't18Standard'}):
        for row in table.findAll('tr')[1:]: 

            #strip out the details of each gift
            person_name = row.contents[0].string
            date_as_listed = row.contents[1].string
            detail_of_gift = row.contents[2].string
            donor_of_gift = row.contents[3].string

            #convert the date to a proper datetime object
            date_of_gift = datetime.strptime(date_as_listed, "%d-%b-%y")
            
            print "Found a gift for " + person_name
            data = {'person_name': person_name, 'detail_of_gift': detail_of_gift, 'donor_of_gift': donor_of_gift, 'date_as_listed': date_as_listed}
            
        
            #save it to the datastore
            datastore.save(unique_keys = ['person_name', 'date_as_listed', 'detail_of_gift'], data = data, date=date_of_gift)
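
# A quick sanity check of the date conversion above: the "%d-%b-%y" format
# expects dates written like "04-Mar-10".
from datetime import datetime
assert datetime.strptime("04-Mar-10", "%d-%b-%y") == datetime(2010, 3, 4)
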
Example #44
def ScrapeWard(ward_id, year):
    url_format_string = "http://breathingspace.sefton.gov.uk/Default.aspx?bsPage=road_safety&option=4&step=2&WardId={0}&StartMonth=1&StartYear={1}&EndMonth=12&EndYear={1}"
    url = url_format_string.format(ward_id, year)
    html = scraperwiki.scrape(url)
    page = BeautifulSoup.BeautifulSoup(html)
    table = page.findAll('table', {'class': 'Grid'})[1]
    for row in table.findAll('tr')[1:]:
        cells = row.findAll('td')
        time = ExtractTime(cells[0].string, cells[1].string)
        location_description = cells[2].string
        latlng = ConvertLocationToLatLng(cells[3].string)
        details_url = 'http://breathingspace.sefton.gov.uk/' + cells[4].find(
            'a')['href']
        data = {
            "date": time,
            "location_description": location_description,
            "url": details_url
        }
        data.update(ScrapeAccidentDetails(details_url))
        datastore.save(unique_keys=['date', 'location_description'],
                       latlng=latlng,
                       data=data)
Example #45
def main():
    w = xlrd.open_workbook(file_contents=scrape(
        "http://uk.sitestat.com/lincolnshire/lincolnshire/s?Home.A_Parent.School_Admissions.All_About_Your_Local_Schools.A__Z_List_of_Schools.AZ_List_of_Schools.xls&ns_type=pdf&ns_url=http://www.lincolnshire.gov.uk/upload/public/attachments/1172/AZ_List_of_Schools.xls"
    ))
    s = w.sheet_by_index(0)

    keys = [str(c.value) for c in s.row(0)]
    schoolname = keys[1]

    for i in range(1, s.nrows):
        r = s.row(i)
        if sum([len(c.value) for c in r[1:]]) == 0:
            # ideally we would test that every cell in the row is empty, but that
            # check is fragile here (see the sketch after this function);
            # this is just an extra heading row, so we skip it
            pass
        else:
            keyvalues = {}
            for (k, c) in zip(keys, r):
                v = str(c.value.replace(u'\u2019', "'"))
                if v != "":
                    keyvalues[k] = v
            datastore.save(unique_keys=[schoolname], data=keyvalues)
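
# The emptiness test above (summing len(c.value)) is the check the comment
# calls fragile: len() fails on numeric cells, where the value is a float. A
# sketch of a more robust test, assuming xlrd is imported as in the scraper
# above; it would be used as `if row_is_blank(r[1:]):`.
def row_is_blank(cells):
    "True if every cell is empty, blank, or contains only whitespace"
    return all(c.ctype in (xlrd.XL_CELL_EMPTY, xlrd.XL_CELL_BLANK)
               or not unicode(c.value).strip() for c in cells)
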
def Main():
    url = "http://www.snp.org/people/candidates/Westminster"

    br = mechanize.Browser()
    br.set_handle_robots(False)
    base = br.open(url)
    page = base.read()
    #print page
    candidates = re.findall('(?si)<div class=\'view-content view-content-people-candidates\'><div class="item-list"><ul>(.*?)</ul></div></div>', page)
    links = re.findall('(?si)<li>(.*?)</li>', candidates[0])
    i = 0
    for link in links[:]:
        data = {}
        constituency = re.findall('(?si)<a href=".*?">(.*?):.*?</a>', link)
        data["constituency"] = RegularizeConstituency(constituency[0])
        name = re.findall('(?si)<a href=".*?">.*?:\s*(.*?)</a>', link)
        data["name"] = name[0]
        ppc_link = re.findall('(?si)<a href="(.*?)">.*?:\s*.*?</a>', link)
        llink = ppc_link[0]
        if llink == "//stewarthosie":
            llink = "/stewarthosie"
        data["url"] = urlparse.urljoin(url, llink)
        black_list = ["/people/midlothian-colin-beattie", "/people/moray-angus-robertson", 
                      "/people/motherwell-wishaw-marion-fellows", "/people/ochil-south-perthshire", 
                      "/people/orkney-shetland-john-mowat"]
        print i, data["url"]
        if ppc_link[0] not in black_list:
            #extra = br.follow_link(url_regex=ppc_link[0])
            try:
                extra = urllib2.urlopen(data["url"])
                Details(extra.read(), data)
            except urllib2.HTTPError as e:
                print e
                
            #br.back()
        #print "DATA: ", data
        datastore.save(unique_keys=['name', 'constituency'], data=data)
        i += 1
Example #47
def process():
    for url, offset in sources:
        book = xlrd.open_workbook(file_contents=scrape(url))
        sheet = book.sheets()[0]

        for row in range(0, sheet.nrows):
            for column in range(0, sheet.ncols):
                cell = sheet.cell(row, column)
                yearRange = getYearRange(cell)
                if yearRange:
                    rowCursor = row
                    while True:
                        rowCursor += 1
                        startIncome, endIncome = getIncomeRange(
                            sheet.cell(rowCursor, column))
                        data = {
                            'url': url,
                            'incomeCoordinate': getCoordinate(rowCursor, column),
                            'taxCoordinate': getCoordinate(rowCursor, column + offset),
                            'yearRange': yearRange,
                            'startIncome': startIncome,
                            'endIncome': endIncome,
                            'taxRate': sheet.cell(rowCursor, column + offset).value
                        }
                        if startIncome or endIncome:
                            print data
                            datastore.save(unique_keys=['url', 'incomeCoordinate', 'taxCoordinate'],
                                           data=data)
                        if startIncome and not endIncome:
                            break
def do_year(y,url):
    pagetext = urllib2.urlopen(url)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"), tokenizer=sanitizer.HTMLSanitizer)
    page = parser.parse(pagetext)

    for section in page.findall("body/div/div/div/div/div/div/div/div/table[@class='fixture']"):

        matchtype = section.find("caption").text

        for match in section.findall("tbody/tr"):

            l = list(match.getchildren())
            d = {}
            d["Match type"] = matchtype
            d["Match number"] = l[0].text
            d["Date"] = make_date(l[1].text, y)
            d["Team 1"] = flatten_refs(l[3])
            d["Team 2"] = flatten_refs(l[5])
            a = l[4].find("a")
            d["Score"] = a.text
            d["Report"] = "http://www.fifa.com" + a.get("href")
            print "%d (%s) %s - %s"%(y,d["Match type"],d["Team 1"],d["Team 2"])
            datastore.save(unique_keys = ["Date","Team 1","Team 2"], data=d)
def scrape_constituency(seat, url):    
    html = scraperwiki.scrape(url)
    page = BeautifulSoup.BeautifulSoup(html)
    # There's all sorts of stuff on this page. I couldn't find a value for the
    # total electorate, although it might be here. There is a turnout line,
    # with a percentage value, from which one could back-compute the
    # electorate; I don't do that yet (see the sketch after this function).
    table = page.find('table', attrs={'class': 'candidate-detail'})
    for candidate_row in table.tbody.findAll('tr'):
        print candidate_row
        items = candidate_row.findAll('td')
        party_class = candidate_row['class']
        # unlike the rest of the scrape, here we do hard-coded indexes.
        name = items[0].span.string.strip()
        party = items[1].string.strip()
        votes_string = items[2].string.replace(',','')
        try:
            votes = int(votes_string)
        except ValueError:
            votes = None
        data = {'seat': seat, 'candidate': name, 'party': party, 'votes': votes}
        datastore.save(unique_keys=['seat', 'candidate', 'party'], data=data)
    datastore.save(unique_keys=['seat'],data={'seat':seat, 'done':True})
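
# As the comment in scrape_constituency() notes, the electorate could be
# back-computed from the turnout percentage once the candidates' votes are
# summed. A hypothetical sketch (the turnout percentage would still have to be
# scraped from the page's turnout line, which this scraper does not do yet):
def estimate_electorate(total_votes, turnout_percent):
    "roughly back-compute the electorate from total votes cast and turnout %"
    if not turnout_percent:
        return None
    return int(round(total_votes * 100.0 / turnout_percent))
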
def categoryscrape(url):     
    print "Ripping " + url
    print ""

    page = BeautifulSoup(scrape(url))
    for nametag in page.findAll("h3"):

        keyvalues = {}

        def addkeyvaluepair(k,v):
            print k + ": " + v
            keyvalues[k] = v    
        
        name = str(nametag.contents[0])
        addkeyvaluepair("Schoolname",name)
        school_details = nametag.nextSibling

        for table_row in school_details.findAll("tr"):
            table_cells = table_row.findAll("td")
            attrib = str(table_cells[0].contents[0]).rstrip(":")

            if attrib == "Address":
                lines = str(table_cells[1].contents[0]).split("\n")
                postcode = postcode_format(str(lines[-1]).replace("&nbsp;",""))
                addkeyvaluepair("Postcode",postcode)
                address = " / ".join([l.rstrip(", ") for l in lines[:-1]])
                addkeyvaluepair("Address",address)

            else:
                contents = tagcontents_to_string(table_cells[1])
                addkeyvaluepair(attrib,contents)

        datastore.save(unique_keys=["Schoolname"], data=keyvalues)
                
        print ""

    print ""
def parse_page(page):

    wrapper = page.find('div', {'id': 'print_div1'})
    for row in wrapper.findAll('tr')[1:]:
        cells = row.findAll('td')
        title = cells[0].contents[0].string
        country = cells[1].string
        funding_type = cells[2].string
        stage = cells[3].string
        start_date = datetime.strptime(cells[4].string, "%d/%m/%Y")
        total_budget = cells[5].string.replace(',', '')

        data = {
            'title': title,
            'country': country,
            'funding_type': funding_type,
            'stage': stage,
            'total_budget': total_budget,
            'start_date': start_date
        }
        datastore.save(
            unique_keys=['title', 'country', 'total_budget', 'start_date'],
            data=data,
            date=start_date)
def parse_row(element):
    
    cell_names = ['location','proposal','applicant','contact_details','anticipated_date_of_application', 'scoping_document_urls']
    cells = element.findAll('td')
    application = dict(zip(cell_names, cells))
    
    # dates are inconsistently formatted
    application['anticipated_date_of_application']  = application['anticipated_date_of_application'].text.strip()
    
    a = re.match('^.*<a href="(.*?)".*',str(application['applicant']))
    if a:
        application['applicant_url'] = unicode(a.groups()[0])
    application['applicant'] = application['applicant'].text.strip()
    
    c = re.match('^.*<a href="mailto:(.*)".*',str(application['contact_details']))
    # contact details are inconsistently formatted
    if c:
        application['contact_email'] = unicode(c.groups()[0])
    application['contact_details'] = application['contact_details'].text.strip().replace('\uf',' ')
    # cr= re.match('^(.*)([(]{0,1}[0-9].*$)',application['contact_details'].text)
    # application['contact_name'] = cr.groups(0)
    application['proposal'] = application['proposal'].text.strip()
    application['location'] = application['location'].text.strip()
    application['scoping_document_urls'] = [sl['href'] for sl in application['scoping_document_urls'].findAll('a')]
    # print json.dumps(application['scoping_document_urls'], indent=4)
    application['scoping_document_urls'] = ','.join(map(add_base, application['scoping_document_urls']))
    l = fetch_location_details(application['applicant'], application['location'])
    application.update(l)
    coords = application['latlng']
    # print coords
    del(application['latlng'])
    # print application
    try:
        datastore.save(unique_keys=['applicant','location','proposal'], data=application, latlng=coords)
    except:
        print formatExceptionInfo()
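
# The commented-out regex in parse_row() hints at splitting contact_details
# into a name and a phone number. A hypothetical sketch, assuming the details
# look like "Jane Doe 01234 567890" or "Jane Doe (01234) 567890":
import re

def split_contact(details):
    "split a contact string into (name, phone) at the first digit or '('"
    m = re.match(r'^(.*?)\s*([(]?\d[\d\s()-]*)$', details)
    if m:
        return m.group(1).strip(), m.group(2).strip()
    return details, None
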
Example #53
import scraperwiki
import BeautifulSoup

from scraperwiki import datastore
from datetime import datetime

#scrape page
html = scraperwiki.scrape('http://news.bbc.co.uk/sport1/hi/football/eng_prem/fixtures/default.stm')
page = BeautifulSoup.BeautifulSoup(html)

first = page.find(True, {'class':'mvb'})
date = first.findNext('b')
fixture = date.parent.nextSibling

while date:
    while fixture and getattr(fixture, 'name', '') != 'hr':
        try:
            time = fixture.contents[-1].string
            dateob = datetime.strptime(date.string + time.strip(), "%A, %d %B %Y, %H:%M")
            home = fixture.contents[0].string
            away = fixture.contents[2].string
            data = {'date':dateob,'home':home,'away':away}
            datastore.save(unique_keys=['date','home','away'], data=data)
            fixture = fixture.nextSibling
        except (AttributeError, IndexError):
            fixture = fixture.nextSibling
    date = date.findNext('b')
    if date:
        fixture = date.parent.nextSibling

            row_dict['document-url'] = "http://www.sendist.gov.uk/Public/" + td.find('a')['href']
        
        if len(row_dict) > 0:
            rows.append(row_dict)

    return (more, rows, viewstate)

viewstate = post_form()
more = True
page = 1
while more:
    print "Scraping page %s" % page
    (more, items, viewstate) = get_results(viewstate, page)
    page += 1
    for item in items:
        datastore.save(unique_keys=['date', 'age', 'document-url'], data=item)


import scraperwiki
import BeautifulSoup
import urllib2
import urllib
import cookielib
import datetime
import re

from scraperwiki import datastore

urlopen = urllib2.urlopen

cj = cookielib.LWPCookieJar()