def scrape_and_look_for_next_link(url, rest_id):
    try:
        html = scraperwiki.scrape(url)
    except scraperwiki.Error:
        print "Sleeping for 10 seconds..."
        time.sleep(10)
        html = scraperwiki.scrape(url)
    
    # print html
    root = lxml.html.fromstring(html)
    rest_id = scrape_restaurants(root, rest_id, base_url, city_url)
    scraperwiki.sqlite.save_var('last_rest_id', rest_id) 
    next_links = root.cssselect("ul.pagination-control li.active a")

    next_link = None
    for link in next_links:
        if link.text and "Next" in link.text:
            next_link = link.attrib.get('href')
            break
    
    if next_link is not None:
        next_url = urlparse.urljoin(base_url, next_link)
        print next_url
        scraperwiki.sqlite.save_var('last_url_scraped', next_url) 
        scrape_and_look_for_next_link(next_url, rest_id)
    else:
        print "Finished scraping!"
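
# The recursive pagination above can hit Python's recursion limit on long
# result lists. A minimal iterative sketch of the same walk, assuming the
# same scrape_restaurants(), base_url and city_url globals used above:
def scrape_all_pages(start_url, rest_id):
    url = start_url
    while url is not None:
        html = scraperwiki.scrape(url)
        root = lxml.html.fromstring(html)
        rest_id = scrape_restaurants(root, rest_id, base_url, city_url)
        scraperwiki.sqlite.save_var('last_rest_id', rest_id)

        next_url = None
        for link in root.cssselect("ul.pagination-control li.active a"):
            if link.text and "Next" in link.text:
                next_url = urlparse.urljoin(base_url, link.attrib.get('href'))
                scraperwiki.sqlite.save_var('last_url_scraped', next_url)
                break
        url = next_url
    print "Finished scraping!"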
def scrape_press_releases():
    releases_page = pq(scraperwiki.scrape(BASE_URL + 'news-releases'))
    for row in releases_page.find('.recordListTitle'):
        sleep(1)

        title = ''
        date = None
        content = ''
        attachments = []

        links = pq(row).find('a')
        page = pq(scraperwiki.scrape(links.eq(0).attr('href')))
        title = _extract_title_from(page)
        content = _readable(page.find('.content').html())
        date = _extract_date_from(page)
        for attachment in page.find('.file_link a'):
            att = pq(attachment)
            attachments.append({att.text(): att.attr('href')})
    
        args = [title, date, content]
        kwargs = {}
        if len(attachments):
            kwargs.update(attachments=attachments)
        
        gasp.add_press_release(*args, **kwargs)
Example #3
def get_os_id(params):
    """
    Function to get Open States ID.  Please do not abuse API key.
    """
    apikey = '49c5c72c157d4b37892ddb52c63d06be'
    params['apikey'] = apikey

    os_url = create_os_url(params)
    raw = scraperwiki.scrape(os_url)
    os_data = demjson.decode(raw)
    os_found = len(os_data)
    os_id = ''

    # Use first if any found, if not remove last name
    if os_found > 0:
        os_id = os_data[0]['id']
    else:
        del params['first_name']
        os_url = create_os_url(params)
        raw = scraperwiki.scrape(os_url)
        os_data = demjson.decode(raw)
        os_found = str(len(os_data)) + '-removed-first'
        if len(os_data) > 0:
            os_id = os_data[0]['id']

    return {
        'found': os_found,
        'id': os_id
    }
def geocode_postcode(postcode, provider='geonames'):
    import urllib2, json
    if provider=='geonames':
        resp = scraperwiki.scrape('http://api.geonames.org/postalCodeSearchJSON?postalcode=%s&maxRows=1&username=scraperwiki' % urllib2.quote(postcode))
        obj = json.loads(resp)
        if 'postalCodes' in obj and len(obj['postalCodes']):
            return (obj['postalCodes'][0]['lat'], obj['postalCodes'][0]['lng'])
        else:
            return (None,None)
    elif provider=='mapit':
        try:
            resp = scraperwiki.scrape('http://mapit.mysociety.org/postcode/%s.json' % urllib2.quote(postcode))
        except urllib2.HTTPError:
            return (None, None)
        else:
            obj = json.loads(resp)
            if 'wgs84_lat' in obj and 'wgs84_lon' in obj:
                return (obj['wgs84_lat'], obj['wgs84_lon'])
            else:
                return (None,None)
    elif provider=='jamiethompson':
        import requests
        try:
            resp = requests.get('http://geo.jamiethompson.co.uk/%s.json' % postcode.replace(' ',''), timeout=5).text
        except:
            return (None,None)
        else:
            obj = json.loads(resp)
            if 'geo' in obj and 'lat' in obj['geo'] and 'lng' in obj['geo']:
                return (obj['geo']['lat'], obj['geo']['lng'])
            else:
                return (None,None)
    else:
        return (None, None)
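
# Minimal usage sketch for geocode_postcode() above: try the default
# geonames provider and fall back to mapit when nothing is found. The
# postcode below is only an illustrative example.
if __name__ == '__main__':
    lat, lng = geocode_postcode('SW1A 1AA')
    if lat is None:
        lat, lng = geocode_postcode('SW1A 1AA', provider='mapit')
    print lat, lng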
def scrapeAndSave(url):
    html = scraperwiki.scrape(url)

    root = lxml.html.fromstring(html)
    for tr in root.cssselect("div#chart_body .chart tr"):
        tds = tr.cssselect("td")
        if len(tds) == 11:
            a = tds[1].cssselect("a")[0]
            game_url = a.attrib["href"]
            game_html = scraperwiki.scrape(game_url)
            game_root = lxml.html.fromstring(game_html)
            title = game_root.cssselect("h1")[0].text_content()
            platform = (
                game_root.cssselect("table#game_infobox tr")[1].cssselect("td")[0].cssselect("a")[0].text_content()
            )
            genre = game_root.cssselect("table#game_infobox tr")[2].cssselect("td")[1].cssselect("a")[0].text_content()
            for game_tr in game_root.cssselect("div#game_table_box table tr"):
                game_tds = game_tr.cssselect("td")
                if len(game_tds) == 5:
                    data = {
                        "Title": title,
                        "Region": game_tds[2].text_content(),
                        "Release_Date": game_tds[3].text_content(),
                        "Platform": platform,
                        "Genre": genre,
                        "Publisher": game_tds[1].text_content(),
                    }
                    saveToStore(data)
def scrape_and_find_csv(url):
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    #this selects all HTML containing link: <p class="fileicon"><a>
    csvs = root.cssselect('p.fileicon a')
    print csvs
    for link in csvs:
        #this prints the result of adding the base URL to the relative link grabbed
        #print "link.attrib.get('href')", link.attrib.get('href')
        fullurl = baseurl+link.attrib.get('href')
        print link.attrib.get('href')[-3:]
        if link.attrib.get('href')[-3:] == "csv":
            data = scraperwiki.scrape(fullurl)
            reader = csv.DictReader(data.splitlines())
            for row in reader:
                # guard against missing amounts (previously raised
                # "AttributeError: 'NoneType' object has no attribute 'decode'")
                amount = row[' Invoice Amount ']
                if amount is not None:
                    row['missingletter'] = amount[0]
                    row[' Invoice Amount_full'] = amount.decode("latin-1")
                    row[' Invoice Amount '] = amount[2:]
                if row['Invoice Ref'] is not None:
                    row['Invoice Ref'] = row['Invoice Ref'].decode("latin-1")
                row['URL'] = fullurl
                print row
                if row['Doc Number'] is not None:
                    scraperwiki.sqlite.save(['Doc Number', 'URL'], row)
def process_journal_pdfs(parser, listurl, errors):
#    print "Finding PDFs on " + listurl
#    u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div.month-entry-title a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        print url
        if -1 != href.find("file://"):
#            print "Skipping non-http URL " + url
            continue
        subhtml = scraperwiki.scrape(url)
        subroot = lxml.html.fromstring(subhtml)
        subhtml = None
        for subahref in subroot.cssselect("div.related-attachements a"):
            subhref = subahref.attrib['href']
            suburl = urlparse.urljoin(url, subhref)
            if -1 == suburl.find(".pdf"):
                continue
            # process only URLs we have not already scraped
            if not parser.is_already_scraped(suburl):
                process_pdf(parser, suburl, errors)
def openPage(pageURL):
    locallisthtml = scraperwiki.scrape(pageURL)
    localsoup = BeautifulSoup(locallisthtml)
    trs = localsoup.find("table",{"width":"770"}).findAll("tr")
    thisi = 0
    for tr in trs:
        thisi = thisi + 1
        tds = tr.findAll("td")
        if len(tds)>1:
            thisa = tds[1].find("a")['href']
            if thisa.find("../") > -1:
                starting_url = siteurl + thisa[3:]
            else:
                starting_url = siteurl + localfolder + "/" + thisa
            print str(thisi) + " " + starting_url
            if starting_url != "http://www.pub-explorer.com/olpg/the-blackswan/ashover/index.htm" and starting_url.find("/http") == -1:
                html = scraperwiki.scrape(starting_url)
                soup = BeautifulSoup(html)
                ps = soup.findAll("p",{"align":"RIGHT"})
                for p in ps:
                    ptext = p.text
                    #print ptext
                    if ptext.find("Baby Chang") > -1 or ptext.find("baby chang") > -1:
                        if starting_url != "http://www.pub-explorer.com/notts/pub/hutt.htm":
                            getDetails(soup)
                        break
Example #9
def scrape_application(link):
    #get data on agent and application person
    details_page = link
    print 'Scraping Details: ', details_page
    page_d = scraperwiki.scrape(details_page)
    tree_app = html.fromstring(page_d)
    
    #get data from decision tab page
    decision_page = link + '&theTabNo=2'
    print 'Scraping Decision: ', decision_page
    page = scraperwiki.scrape(decision_page)
    tree = html.fromstring(page)
   
    data = {
        'info_url' : link,
        'decision' : remove_characters(tree.cssselect("div#tabs_container div#tabContent div#fieldset_data p.fieldset_data")[0].text),
        'app_date' : remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[0].text_content()),
        'app_ref' : remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[1].text),
        'reg_date' : remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[2].text),
        'decision_date' : remove_characters(tree.cssselect("div#tabs_container div#tabContent div#fieldset_data p.fieldset_data")[1].text),
        'app_type' : remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[4].text),
        'ext_date' : remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[5].text),
        'main_loc' : remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[6].text),
        'desc' : remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[7].text),
        'full_desc' : remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[8].text),
        'app_status' : remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[9].text),
        'status_desc' : remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[10].text),
        'comment' : remove_characters(tree.cssselect("div#apas_form fieldset.apas div#fieldset_data p.fieldset_data")[11].text),
        'application_company' : remove_characters(tree_app.cssselect("div#tabs_container div#tabContent div#fieldset_data p.fieldset_data")[2].text),
        'agent' : remove_characters(tree_app.cssselect("div#tabs_container div#tabContent div#fieldset_data p.fieldset_data")[8].text)
        }
    print 'Scraping Complete :) NEXT! \n\n'
    scraperwiki.sqlite.save(unique_keys=['app_ref'], data=data)
def main():
    #scrape page
    borough_html = scraperwiki.scrape('http://maps.met.police.uk/php/dataview.php?area=MPS&ct=8')
    borough_page = BeautifulSoup.BeautifulSoup(borough_html)
    boroughs = extract_areas(borough_page)

    for borough in boroughs:
        ward_html = scraperwiki.scrape(borough['area_link'])
        ward_page = BeautifulSoup.BeautifulSoup(ward_html)
        wards = extract_areas(ward_page)
        for ward in wards:
            sub_ward_html = scraperwiki.scrape(ward['area_link'])
            sub_ward_page = BeautifulSoup.BeautifulSoup(sub_ward_html)
            sub_wards = extract_areas(sub_ward_page) 

            for sub_ward in sub_wards:
                crimes = extract_crime(sub_ward['area_link'])
                for crime in crimes:
                    
                    data = {
                        'borough' : borough['area_name'],
                        'ward' : ward['area_name'],
                        'sub_ward' : sub_ward['area_name'],
                        'super_output_area_code' : sub_ward['area_id'],                            
                        'month': crime['month'],
                        'crime_type': crime['crime_type'],
                        'crime_rate': crime['crime_rate'],
                        'crime_count': crime['crime_count'],                            
                        }

                    datastore.save(unique_keys=['super_output_area_code', 'month', 'crime_type'], data=data)
def congress_scrape(api_call, congress, chamber,bill_type):
    results = simplejson.loads(scraperwiki.scrape(api_call))
    results = results['results'][0]['bills']
    for bill in results:
        bill_uri = bill['bill_uri']
        bill_number = bill['number']
        title = bill['title']
        committees = bill['committees']
        num_cosponsors = bill['cosponsors']
        last_action = bill['latest_major_action']
        date_last_action = bill['latest_major_action_date']
        time.sleep(1)
        if int(num_cosponsors) > 0:
            cosponsor_call = 'http://api.nytimes.com/svc/politics/v3/us/legislative/congress/%s/bills/%s/cosponsors.json?api-key=c886aef674b84bc2ce2f20439b7fff9c:12:66229250' % (congress, bill_number.replace(".",""))
            time.sleep(2)
            print cosponsor_call
            cosponsor_results = simplejson.loads(scraperwiki.scrape(cosponsor_call))['results'][0]
            sponsor = cosponsor_results['sponsor']
            sponsor_id = cosponsor_results['sponsor_id']
            #Cosponsers: List of dictionaries
            cosponsors = cosponsor_results['cosponsors']
            for cosponsor in cosponsors:
                dict_results = {"Bill_number":bill_number,'Title':title, 'Committee':committees,'Last_action':last_action, "Date_Last_action":date_last_action, "Sponsor":sponsor, "Sponsor_ID":sponsor_id, "Congress_Number":congress,"Chamber": chamber}
                cosponsor_id = cosponsor['cosponsor_id']
                cosponsor_name = cosponsor['name']
                dict_results['Cosponsor_ID'] = cosponsor_id
                dict_results["Cosponsor_Name"] = cosponsor_name
                dict_results["Unique"] = bill_number+cosponsor_id
                scraperwiki.sqlite.save(unique_keys=['Unique'], data = dict_results)
def ParseSeatForCandidates(seat):
    jsontext = scraperwiki.scrape("http://www.yournextmp.com/seats/%s?output=json" % seat)
    contents = json.loads(jsontext)["result"]

    # get wikipedia link not in the json dump
    text = scraperwiki.scrape("http://www.yournextmp.com/seats/%s" % seat)
    constituencyurl = re.search('<a href="(http://en.wikipedia.org/wiki/[^"]*?\(UK_Parliament_constituency\))"', text).group(1)

    constituency = contents["name"]
    result = [ ]
    for candidate in contents["candidates"]:
        url = "http://www.yournextmp.com/candidates/%s" % candidate["code"]
        
        data = {"name":candidate["name"], "url":url, "party":candidate["party"]["name"], 
                 "constituency":constituency, "constituencyurl":constituencyurl}

        # get wikipedia link not in the json dump
        ptext = scraperwiki.scrape(url)
        wpurls = re.findall('<a href="(http://en.wikipedia.org/wiki/.*?)"', ptext)
        if wpurls:
            data["wpurl"] = re.sub(" ", "_", wpurls[0])
        #result.append(data)

        #print ptext
        for w in re.findall('<h3>\s*<a href="(.*?)"', ptext):
            if not re.match("http://www.labour.org.uk|http://www.conservatives.com|http://www.libdems.org.uk|http://en.wikipedia.org/", w):
                result.append(w)
        
    return result
def series_article_urls():
    html = scraperwiki.scrape('http://www.economist.com/blogs/charlemagne')
    root = lxml.html.fromstring(html)
     
    last_page_element = root.cssselect('li.pager-last a')
    
    last_page_number = 0
    if last_page_element:
        print last_page_element[0].get('href')
        regPat = re.compile('.*(\d{2,2})')
        found = re.match(regPat, last_page_element[0].get('href')) 
        if found: last_page_number = int(found.group(1))
        #print "Last page number:", last_page_number
    
    #http://www.economist.com/blogs/charlemagne/2012/12/french-muslims
    
    
    data = page_articles(root)
    
    for i in range(1,last_page_number+1):
        print "Page number:",i
        html = scraperwiki.scrape('http://www.economist.com/blogs/charlemagne?page=' + str(i))
        page_root = lxml.html.fromstring(html)
        new_articles = page_articles(page_root)
        #print data.items()
        #print new_article.items() 
        data = data + new_articles
        print "-------------------------"
    
    scraperwiki.sqlite.save(unique_keys=['path'], data=data)
def scrape_meeting(url):
    html = scraperwiki.scrape(url)
    
    #fjarvistir = soup.find(text=re.compile("Fjarvistarleyfi:").findParent)
    if "Fjarvistarleyfi" not in html:
        return
    else:
        root = lxml.html.fromstring(html)
        absent = {}
        absent['meeting'] = root.xpath('//h1/text()')[0]
        try:
            absent_link = root.xpath('//b[text()="Fjarvistarleyfi"]/..')[0].attrib['href']
        except:
            return
        absent['meeting_url'] = absent_link
        html = scraperwiki.scrape(absent_link)
        root = lxml.html.fromstring(html)
        absent['assembly_number'] = re.split(" ",root.xpath('//title/text()')[0])[1][:3]
        p = root.xpath('//p')
        if p:
            for x in p:
                name = x.text.strip().partition(',')[0].encode('utf-8')
                absent['representative'] = string_replace(replacements, name).strip().decode('utf-8')
                scraperwiki.sqlite.save(["meeting", "assembly_number", "representative"], absent, verbose=1)
def process_journal_pdfs(parser, listurl, errors, recurse):
#    print "Finding PDFs on " + listurl
#    u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div.items a"):
        url = urlparse.urljoin(listurl, ahref.attrib['href'])
        if -1 == url.find("doc_download"):
            continue
        consider_url(parser, url, errors)
        #print url
    for ahref in root.cssselect("div.item-list a"):
        suburl = urlparse.urljoin(listurl, ahref.attrib['href'])
        #print "sub " + suburl
        subhtml = scraperwiki.scrape(suburl)
        subroot = lxml.html.fromstring(subhtml)
        subhtml = None
        for subahref in subroot.cssselect("div.article a"):
            href = subahref.attrib['href']
            #print href
            subsuburl = urlparse.urljoin(suburl, href)
            #print "subsub " + subsuburl
            if -1 == subsuburl.find("doc_download"):
                continue
            consider_url(parser, subsuburl, errors)
        subroot = None
    if recurse:
        seen = { listurl : 1 }
        for ahref in root.cssselect("div.pagination a"):
            pageurl = urlparse.urljoin(listurl, ahref.attrib['href'])
            #print "P: " + pageurl
            if pageurl not in seen:
                process_journal_pdfs(parser, pageurl, errors, False)
                seen[pageurl] = 1
def scrape_member(url_base, url):
    record = {}
    council = re.match(r'(.*)\.stm', url)

    if council:
        s = '/council/' + url
        record['url'] = url_base + s
    else:
        s = url + 'contact.stm'
        record['url'] = url_base + url

    record['source_url'] = url_base + s
    soup = BeautifulSoup(scraperwiki.scrape(url_base + s))
    c = soup.find('div', id='content')
    td = c.findAll('table')[0].findAll('td')
    email = False
    offices = []

    if council:
        record['elected_office'] = 'Councillor'
        photo_url = c.find('img', 'bio_pic')['src']
        # Meta description also has representative and district names, but in one case it is incorrect.
        record['name'] = soup.find('span', {'class': 'bg90B'}).text.replace('Councillor', '').strip()
        record['district_name'] = soup.find('span', {'class': 'bg100B'}).text.replace(' Ward', '').strip()
        qs = urlparse(td[7].find('a')['href']).query
        qs = parse_qs(qs)
        rec = qs.get('Recipient', None)
        if rec:
            email = rec[0] + '@winnipeg.ca'
        postal = td[1]
        tel = td[3].text
        fax = td[5].text
    else:
        record['elected_office'] = 'Mayor'
        record['boundary_url'] = '/boundaries/census-subdivisions/4611040/'
        lm = soup.find('div', id='left-menu')
        l = lm.findAll('div', 'section')[1]
        photo_url = l.find('img')['src']
        record['name'] = l.find('a').text.replace('Mayor ', '').strip()
        email = '*****@*****.**'
        postal = td[5]
        tel = ''
        fax = td[1].text
        mayor_page = BeautifulSoup(scraperwiki.scrape(url_base + '/interhom/mayor/'))
        record['name'] = mayor_page.find('img',src="/interhom/Mayor/images/signature.jpg")['alt']


    postal =  '\n'.join([x.strip() for x in postal.findAll(text=True)])
    offices = []
    offices.append({
        'type': 'constituency',
        'tel': tel,
        'fax': fax,
        'postal': postal
    })
    record['offices'] = json.dumps(offices)
    record['photo_url'] = url_base + photo_url
    if email:
        record['email'] = email
    return record
def getAllEntries(base_url, city, branch):
    url = base_url + "/" +city[0] +"/"+branch+"/"
    more_results = True
    page=1
    while more_results:
        if page==1:
            html = scraperwiki.scrape(url)
        else:
            current_page = root.find_class("current")
            if current_page:
                next_page = current_page[0].getparent().getnext()
                if next_page!=None:
                    p = next_page.text_content()
                    html = scraperwiki.scrape(url+str(page)+"/")
                else:
                    more_results = False
            else:
                    more_results = False
        page=page+1
    
    
        if more_results:           
            root = lxml.html.fromstring(html)
            results = root.find_class("result-wrap")
            for result in results:
                header = result.cssselect("h2")
                name = header[0].cssselect("a")
                
                div = result.find_class("addr")
                addr = div[0].cssselect("p")

                street_address = addr[0].text_content()
                try:
                    street_address = street_address[:street_address.index("/")]
                except ValueError:
                    pass

                print street_address
                geocode_url = 'http://maps.googleapis.com/maps/api/geocode/json?address='+urllib.quote_plus(street_address)+'&sensor=false&output=json'
                print geocode_url
                georeq = urllib2.Request(geocode_url)
                geo_response = urllib2.urlopen(georeq)
                geocode = simplejson.loads(geo_response.read())
                print geocode
                data_lat = None
                data_lng = None
                if geocode['status'] != 'ZERO_RESULTS':
                    data_lat = geocode['results'][0]['geometry']['location']['lat']
                    data_lng = geocode['results'][0]['geometry']['location']['lng']

                print data_lat
                print data_lng

                data = {
                    'city': city[1],
                    'branch': branch,
                    'business' : name[0].text_content(),
                    'address' : street_address,
                    'data_lat' : data_lat,
                    'data_lng' : data_lng
                }
                scraperwiki.sqlite.save(unique_keys=['business'], data=data)
def scrapePage(page='0'):
    html = scraperwiki.scrape("http://opendataphilly.org/opendata/?sort=name&filter=data&page="+str(page))
    root = lxml.html.fromstring(html)
    lxml.html.resolve_base_href(root) #not doing anything

    for res in root.cssselect("#results_list li.resource"):
        title = res.cssselect("#resource_title")
        desc = res.cssselect("#resource_desc")
        for link in title[0].iterlinks():
            # scrape each item i.e. http://opendataphilly.org/opendata/resource/11/topographic-contours-2ft/
            itemUrl = "http://opendataphilly.org"+str(link[2])

        itemHtml = scraperwiki.scrape(itemUrl)
        item = lxml.html.fromstring(itemHtml)

        data = {
          'title' : title[0].text_content(),
          'desc' : desc[0].text_content(),
          'itemUrl' : itemUrl
          #'itemHtml' : itemHtml
        }

        tab_data = item.cssselect("#tab_data")[0]
        for counter,info in enumerate(tab_data.cssselect("div")):
            if (counter == 0 or counter % 2):
                 key = str(info.text_content()) 
            else: 
                 data[key] = str(info.text_content())
            
        print data
        scraperwiki.sqlite.save(unique_keys=['title'], data=data)
def scrape(url):
    for n in [1, 2, 3]:
        try:
            return scraperwiki.scrape(url)
        except:
            continue
    return scraperwiki.scrape(url)
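
# A slightly more defensive variant of scrape() above (a sketch, not the
# original author's code): back off between attempts and re-raise the last
# error instead of silently retrying a fourth time.
import time

def scrape_with_backoff(url, attempts=3, delay=2):
    for n in range(attempts):
        try:
            return scraperwiki.scrape(url)
        except Exception:
            if n == attempts - 1:
                raise
            time.sleep(delay * (n + 1))  # wait 2s, then 4s, ...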
def scrape_site(start_url, domaine):
    html_content = scraperwiki.scrape(start_url)
    p_num=1
    r_num=0
    while True:
        root = lxml.html.fromstring(html_content)
        
        data_list = root.cssselect('div[class="post clearfix"]')
        
        
        if len(data_list)==0:
            print 'SPIDER-STOP'
            break
        else:
            for i in xrange(len(data_list)):
                abs_link = start_url
                scrape_info(abs_link, r_num)
                r_num+=1
            
    
        for attempt in range(5):
            try:
                html_content = scraperwiki.scrape(s_url+'?Page='+str(p_num+1))
                p_num+=1
                break
            except:
                pass
def scrape_fgmarket():
    scraperwiki.sqlite.save_var("source", "www.fgmarket.com ")
    scraperwiki.sqlite.save_var("author", "Alex Maslakov") 
    root_html = lxml.html.fromstring(scraperwiki.scrape(base_domain_url))
    
    #for each outer category
    for category_item_html in root_html.cssselect("div.home-section div.home-content div.box.last-content div.box-columns div.box-column-half ul.box-column-half-inner.home-category-set  li a.home-category-main")[0:1]:
        first_page_url = get_first_page_url(category_item_html.get('href'))
        
        #parse the 1st page category html
        first_page_html_raw = lxml.html.fromstring(scraperwiki.scrape(first_page_url))
        
        #scrape each company on the first page
        premium_companies_html = first_page_html_raw.cssselect("div.category-content div.category-listings div.prem-listing")
        if len(premium_companies_html) > 0:
            for company_item_html in premium_companies_html:
                scrape_premium_company(company_item_html, 1)

        #scrape each company on the other pages
        next_page_html_raw = first_page_html_raw
        counter = 2
        while next_page_exists(next_page_html_raw):
            #next page url
            next_page_url_html_raw = next_page_html_raw.cssselect("div.category-content div.category-listings nav.pagination a")[-1]
            next_page_url = get_next_page_url(category_item_html.get('href'), next_page_url_html_raw.get('href'))
            next_page_html_raw = lxml.html.fromstring(scraperwiki.scrape(next_page_url))

            #scrape each company on the first page
            for company_item_html in next_page_html_raw.cssselect("div.category-content div.category-listings div.prem-listing"):
                scrape_premium_company(company_item_html, counter)
                       
            counter += 1 
def process_page(page_url):
    print 'Processing page', page_url
    page_html = scraperwiki.scrape(page_url)
    page_rdf  = scraperwiki.scrape('http://any23.org/any23', {'type' : 'text/html', 'format' : 'turtle', 'body' : page_html})
    page_dom  = lxml.html.fromstring(page_html)    
    
    info    = None
    email   = None
    website = None
    try:
        info = lxml.html.tostring( page_dom.cssselect('img[alt="info evento"]')[0].getparent().getnext() )
        print "PROCESS_INFO", process_info(page_url, info)
    except BaseException as e: print 'Info not found.', e
    try: 
        email = page_dom.cssselect('img[alt="e-mail"]')[0].getnext().text_content() 
    except BaseException as e: print 'EMail not found.', e
    try: 
        website = page_dom.cssselect('img[alt="Sito Web Esterno"]')[0].getnext().get('href') 
    except BaseException as e: print 'Website not found.', e
            
    if email:   
        page_rdf += '<{0}> vcard:email "{1}".\n'.format(page_url, email)
    if website: 
        page_rdf += '<{0}> vcard:url   "{1}".\n'.format(page_url, website)
    
    page_rdf = page_rdf.replace(DEFAULT_ANY23_PAGE, '<{0}>'.format(page_url))
    data = {
        'url'    : page_url,
        'turtle' : page_rdf
    }
    scraperwiki.sqlite.save(unique_keys=['url'], data=data)
def scrape(url):
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    for tr in root.cssselect("table#listtable tr"):
        tds = tr.cssselect("td")
        if tds[0].text_content() == 'ID':
            continue
        game_url = tds[1].cssselect("a")[0].attrib["href"]
        game_html = scraperwiki.scrape(game_url)
        game_root = lxml.html.fromstring(game_html)
        title = game_root.cssselect("div#gameTitle h1")[0].text_content()
        platform = game_root.cssselect("div#gameInfo h2 a")[0].text_content()
        publisher = ""
        details = game_root.cssselect("div#gameVitals p")[0].text_content()
        genres_index = details.find('Genres')
        release_date_index = details.find('Release Date')
        genres = game_root.cssselect("div#gameVitals p")[0].text_content()[genres_index + 7 : release_date_index].strip()
        if len(game_root.cssselect("div#gameVitals p img")) == 1:
            publisher = game_root.cssselect("div#gameVitals p img")[0].attrib["title"]
        elif len(game_root.cssselect("div#gameVitals p img")) == 2:
            publisher = game_root.cssselect("div#gameVitals p img")[1].attrib["title"]
        else:
            publisher_index = details.find("Publisher")
            if(publisher_index != -1):
                publisher = details[publisher_index + 10:].strip()
        data = {
            'Title' : title,
            'Platform' : platform,
            'Publisher' : publisher,
            'Genres' : genres
        }
        saveToStore(data)
def import_meta():
    rows = scraperwiki.scrape(base_url + spine_meta).splitlines()
    rows = csv.DictReader(rows)
    for row in rows:
        scraperwiki.sqlite.save(['Variable'], row, table_name='spine_meta')
    
    rows = scraperwiki.scrape(base_url + absence_meta).splitlines()
    rows = csv.DictReader(rows)
    for row in rows:
        scraperwiki.sqlite.save(['Variable'], row, table_name='absence_meta')
    
    rows = scraperwiki.scrape(base_url + census_meta).splitlines()
    rows = csv.DictReader(rows)
    for row in rows:
        scraperwiki.sqlite.save(['Variable'], row, table_name='census_meta')
    
    rows = scraperwiki.scrape(base_url + spend_meta).splitlines()
    rows = csv.DictReader(rows)
    for row in rows:
        scraperwiki.sqlite.save(['Variable'], row, table_name='spend_meta')
    
    rows = scraperwiki.scrape(base_url + workforce_meta).splitlines()
    rows = csv.DictReader(rows)
    for row in rows:
        scraperwiki.sqlite.save(['Variable'], row, table_name='workforce_meta')
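
# The five near-identical blocks above could be driven by a single loop; a
# sketch assuming the same base_url and *_meta URL-suffix globals used above.
def import_meta_compact():
    sources = [
        (spine_meta, 'spine_meta'),
        (absence_meta, 'absence_meta'),
        (census_meta, 'census_meta'),
        (spend_meta, 'spend_meta'),
        (workforce_meta, 'workforce_meta'),
    ]
    for suffix, table in sources:
        rows = csv.DictReader(scraperwiki.scrape(base_url + suffix).splitlines())
        for row in rows:
            scraperwiki.sqlite.save(['Variable'], row, table_name=table)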
def main():
    phones = []

    querypage_url = "http://www.verkkokauppa.com/fi/s?s=1&q=&submit=Hae&category=22-658-4814&brand=Nokia"
    querypage_html = scraperwiki.scrape(querypage_url)
    querypage_root = lxml.html.fromstring(querypage_html)

    for el in querypage_root.cssselect("div.productRow"):
        stock = el.cssselect("div.stock")[0].text
        if "ennakkotilaa" not in stock:
            product_url = el.cssselect("a.productInfo")[0].attrib["href"]
            phones.append({'url':product_url})

    for phone in phones:
        url_parts = phone["url"].split("/")
        phone_model = url_parts[-1]
        product_code = url_parts[-3] + url_parts[-2]
        phone_html = scraperwiki.scrape(phone["url"])
        phone_root = lxml.html.fromstring(phone_html)
        for el in phone_root.cssselect("div#latestSoldList li"):
            if el.text:
                try:
                    sold_date = datetime.strptime(el.text[3:], "%d.%m.%Y %H:%M")
                except:
                    sold_date = parse_sold_text(el.text)
                if sold_date:
                    sold_pieces = el.cssselect("span")[0].text
                    sold_pieces = [int(s) for s in sold_pieces.split() if s.isdigit()][0] # because "8 kpl", "yli 20 kpl" cases
                    scraperwiki.sqlite.save(unique_keys=['pcode','date'], data=({"pcode":product_code,"model":phone_model,"date":sold_date,"pieces":sold_pieces}))
def scrape_saint_data(url):
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)

    if html.startswith('<script language="JavaScript">'):
        found = re.search('href=\".*\"', soup.contents[0].contents[0].string).span()
        url = soup.contents[0].contents[0].string[found[0]+6:found[1]-1]
        print url
        html = scraperwiki.scrape(url)
        soup = BeautifulSoup(html)

    title = soup.find("h1").contents[0].string
    if re.search('Saints.SQPN.com', title):
        title = soup.find("h2").contents[0].contents[0]
    print title

    # Now the data
    names = soup.findAll("ins")
    for name in names:
        if name.find("a"):
            name = name.find("a")
        print name.contents[0]
        next = name
        while next.nextSibling==None:
            next = next.parent
        print next
        while type(next.nextSibling).__name__ == 'NavigableString':
            next = next.nextSibling
        print next.contents[0]
def main():
    decoder = json.JSONDecoder()

    # 1000000 is a stupidly large number?
    scrapers = decoder.decode(scraperwiki.scrape("https://api.scraperwiki.com/api/1.0/scraper/search?format=jsondict&maxrows=1000000"))
    # would be nicer if there was an "offset" to avoid generating superhuge JSON objects
    # even nicer if we could do it incrementally

    r = Relation()

    for d in scrapers:
        short_name = d["short_name"]
        if short_name[-8:] == ".emailer":
            continue
        print short_name
        details = decoder.decode(scraperwiki.scrape("https://api.scraperwiki.com/api/1.0/scraper/getinfo?format=jsondict&name=%s&version=-1&quietfields=runevents%%7Cdatasummary%%7Cuserroles%%7Chistory"%short_name))[0]
        code = details["code"]

        # search code for mentions views
        v = re.compile("views\\.scraperwiki\\.com/run/([A-Za-z0-9_]*)")
        for other in re.findall(v,code):
            scraperwiki.sqlite.save(unique_keys=["from","to","type"],data={"from":short_name, "to":other, "type":"mention"})

        # prepare table of hashes to find common code        
        for h in rolling_hash(stripped_python(code)):
            r.relate(h,short_name)

        # find attachments
        for other in details["attachables"]:
            scraperwiki.sqlite.save(unique_keys=["from","to","type"],data={"from":short_name, "to":other, "type":"attachment"})

    # now find the common code
    for ((x,y),n) in r.pairs_by_keys_in_common().iteritems():
        scraperwiki.sqlite.save(unique_keys=["from","to","type"],data={"from":x, "to":y, "type":"common code", "strength":n})
def do_scrape():
    az_html = scraperwiki.scrape('http://www.lambeth.gov.uk/Services/')
    list_root = lxml.html.fromstring(az_html)
    for a in list_root.cssselect("div.AZ li a"):
        try:
            page_title =  a.text
            page_link = 'http://www.lambeth.gov.uk' +  a.get('href')

            print "scraping " + page_link 
            page_full_html = scraperwiki.scrape(page_link)
            page_root = lxml.html.fromstring(page_full_html)

            #pull out the section details
            print page_root.cssselect('div.breadCrumb a')[2].text
            sections_csv = page_root.cssselect('div.breadCrumb a')[2].text

            #check it is a content page, not a nav page
            if page_full_html.find('cScape.Lambeth.GenericTemplates/ServiceCategory.aspx') <0 and page_full_html.find('cScape.Lambeth.GenericTemplates/DocumentSummary.aspx') <0 and page_full_html.find('cScape.Lambeth.GenericTemplates/GroupDocument.aspx') <0:

                content_fragment = page_root.cssselect('div.page')[0]
                for toplink in content_fragment.cssselect('div.topLink'):
                    content_fragment.remove(toplink)
                content_html = lxml.html.tostring(content_fragment)  
                content_html = clean_html(content_html)


                scraperwiki.sqlite.save(unique_keys=["source_url"], data={"source_url":page_link, "title":page_title, "content": content_html, 'sections_csv': sections_csv})
            else:
                print "ignoring nav page"
        except:
            print "something went wrong"
            pass
def process_journal_pdfs(parser, listurl, errors):
#    print "Finding PDFs on " + listurl
#    u = urllib.parse.urlparse(listurl)
    html = scraperwiki.scrape(listurl)
    root = lxml.html.fromstring(html)
    html = None
    for ahref in root.cssselect("div#placeholder-content-main-left-column a"):
        href = ahref.attrib['href']
        url = urlparse.urljoin(listurl, href)
        if -1 != href.find("file://") or -1 == url.find('/postjournal/article'):
#            print "Skipping non-http URL " + url
            continue
        subhtml = scraperwiki.scrape(url)
        subroot = lxml.html.fromstring(subhtml)
        subhtml = None
        for subahref in subroot.cssselect("div.article-content a"):
            subhref = subahref.attrib['href']
            suburl = urlparse.urljoin(listurl, subhref)
            if -1 == suburl.find(".pdf"):
                continue
            # process only URLs we have not already scraped
            if not parser.is_already_scraped(suburl):
                process_pdf(parser, suburl, errors)
def scrape_site(start_url, domaine):
    html_content = scraperwiki.scrape(start_url)
    p_num=1
    r_num=0
    while True:
        root = lxml.html.fromstring(html_content)
        data_list = root.cssselect('td[class="lightbg exhlist_company"]')
        
        if len(data_list)==0:
            print 'SPIDER-STOP'
            break
        for i in range(len(data_list)):
            temp_link = data_list[i].attrib.get('onclick')
           
            rel_link = temp_link.split('(')[1].split(',')[0].replace("'","")
            abs_link = domaine+rel_link
            
            scrape_info(abs_link, r_num)
            r_num+=1

        for attempt in range(5):
            try:
                html_content = scraperwiki.scrape(s_url+'?Page='+str(p_num+1))
                p_num+=1
                break
            except:
                pass
import scraperwiki
import simplejson
import urllib2


QUERY = 'crref OR chref OR ccref OR chref12'
GEOINFO = '53.5,-8,257km'
RESULTS_PER_PAGE = '100'
LANGUAGE = 'en'
NUM_PAGES = 15 

for page in range(1, NUM_PAGES+1):
    base_url = 'http://search.twitter.com/search.json?q=%s&geocode=%s&rpp=%s&lang=%s&page=%s' \
         % (urllib2.quote(QUERY), urllib2.quote(GEOINFO), RESULTS_PER_PAGE, LANGUAGE, page)
    try:
        results_json = simplejson.loads(scraperwiki.scrape(base_url))
        for result in results_json['results']:
            data = {}
            data['id'] = result['id']
            data['text'] = result['text']
            data['from_user'] = result['from_user']
            print data['from_user'], data['text']
            scraperwiki.sqlite.save(["id"], data) 
    except:
        print 'Oh dear, failed to scrape %s' % base_url
        
import scraperwiki
import lxml.etree
import re
from geopy import geocoders

url = "http://www.regionofwaterloo.ca/en/gettingAround/resources/RegionalTrafficVolumes-AllLocations.pdf"
pdfdata = scraperwiki.scrape(url)
print "The pdf file has %d bytes" % len(pdfdata)

xmldata = scraperwiki.pdftoxml(pdfdata)
print "After converting to xml it has %d bytes" % len(xmldata)
print "The first 2000 characters are: ", xmldata[:2000]

root = lxml.etree.fromstring(xmldata)
pages = list(root)

localMuni = ''
lastStreet = ''
count = 0
munis = {'CAM': 'Cambridge', 
         'KIT': 'Kitchener', 
         'NDF': 'North Dumfries', 
         'WAT': 'Waterloo', 
         'WIL': 'Wilmot', 
         'WEL': 'Wellesley', 
         'WOO': 'Woolwich'}

g = geocoders.Google(domain='maps.google.ca')

for page in pages:
    for el in list(page):
def process_catalog_page(page_index):
    catalog_page_html = scraperwiki.scrape(CATALOG_PAGINATION +
                                           str(page_index))
    catalog_page_dom = lxml.html.fromstring(catalog_page_html)
    for dataset_link in catalog_page_dom.cssselect('.cerca_td_nome a'):
        process_dataset_page(dataset_link.get('href'))
Example #34
import scraperwiki
html = scraperwiki.scrape('http://inmo.ie/6022')
import lxml.html
root = lxml.html.fromstring(html) # turn our HTML into an lxml object
tds = root.cssselect('td') # get all the <td> tags
for td in tds:
    print td.text_content()                # just the text inside the HTML tag
for td in tds:
     record = { "td" : td.text_content() } # column name and value
     scraperwiki.sqlite.save(["td"], record) # save the records one by one
Example #35
import scraperwiki
from bs4 import BeautifulSoup  # documentation at http://www.crummy.com/software/BeautifulSoup/bs4/doc/

scraperwiki.sqlite.attach(
    "dfid-contracts"
)  # Attaching scraper https://scraperwiki.com/scrapers/dfid-contracts/

links = scraperwiki.sqlite.select(
    "URL from `dfid-contracts`.swdata"
)  # Selecting the URLs collected from the search results from contract finder

# Getting the html from the links
for link in links:
    url = link["URL"]
    print url
    html = scraperwiki.scrape(url)
    soup = BeautifulSoup(html)
    #print soup

    title = soup.find("h2", "legend-edit").get_text().replace("English",
                                                              "").strip()

    # Paragraph tags contain information like reference, duration, nature of contract, etc.
    ps = soup.find_all("p", "clearfix")
    for p in ps:
        span_text = ""
        span = p.find("span")
        if span:
            span_text = span.get_text().strip()
            span.clear()
        if span_text != "":
Example #36
###############################################################################
# Air pollution monitoring site scraper
###############################################################################

import scraperwiki
import lxml.html
import time
import datetime

today_date = str(datetime.datetime.now())

# retrieve the page
starting_url = 'http://www.londonair.org.uk/london/asp/publicstats.asp?region=0&site=MY7&bulletin=hourly&la_id=&statyear=2011&postcode=&MapType=Google&zoom=9&lat=51.431751825946115&lon=-0.17578125&Species=All'
html = scraperwiki.scrape(starting_url)

# get all the td tags in the stats table
root = lxml.html.fromstring(html)
tds = root.cssselect('table#sitestatssub td span')

# save them
i = 1
for td in tds:
    record = {"date_scraped": today_date, "td": td.text, "no": i}
    print record
    scraperwiki.sqlite.save(["td"], record)
    i = i + 1
import scraperwiki
import lxml.html 
import datetime


html = scraperwiki.scrape("http://mg.co.za/zapiro/")
root = lxml.html.fromstring(html)

title = root.cssselect("li.last_crumb a")[0].text_content()
link = root.cssselect("div#cartoon a")[0].attrib['href']
cartoon = root.cssselect("div#cartoon_full_size img")[0].attrib['src']

now = datetime.datetime.now()

data = {
    'link': link,
    'title': "Zapiro:" + title,
    'description': '<img src="'+cartoon+'" border="0" /><br /><br />Like Zapiro? Then check out my <a href="http://feeds.feedburner.com/MadamEve">Unofficial Madam &amp; Eve Feed</a>. All Madam &amp; Eve cartoons are taken directly from the <a href="http://www.madamandeve.co.za">official website</a> as it is updated.',
    'pubDate': str(now) ,
}
scraperwiki.sqlite.save(unique_keys=['link'], data=data)
import scraperwiki
html = scraperwiki.scrape("http://pretraga2.apr.gov.rs/EnterprisePublicSearch/details/EnterpriseBusinessName/1335330?code=620B52581BD2D831C00E2A0D0DC04739C1C3B6FB")
print html
Example #39
import scraperwiki
import lxml.html

html = scraperwiki.scrape("https://scraperwiki.com/")
root = lxml.html.fromstring(html)

for el in root.cssselect("div.tags li a"):
    print el.text, el.attrib['href']

    html_for_tag_page = scraperwiki.scrape("https://scraperwiki.com" +
                                           el.attrib['href'])
    root_for_tag_page = lxml.html.fromstring(html_for_tag_page)

    matching_python_scrapers = 0
    for tag_el in root_for_tag_page.cssselect("table.code_about tr.python"):
        matching_python_scrapers = matching_python_scrapers + 1

    scraperwiki.sqlite.save(unique_keys=['url'],
                            data={
                                'title': el.text,
                                'url': el.attrib['href'],
                                'num_python_scrapers': matching_python_scrapers
                            })
