Code Example #1
def urltohtml(url="http://www.madingley.org/uploaded/Hansard_08.07.2010.pdf"):
    import scraperwiki, urllib2, lxml.etree
    lazycache=scraperwiki.swimport('lazycache')
    pdfdata = lazycache.lazycache(url)

    xmldata = scraperwiki.pdftoxml(pdfdata)
    root = lxml.etree.fromstring(xmldata)
    pages = list(root)
    
    # this function has to work recursively because we might have "<b>Part1 <i>part 2</i></b>"
    def gettext_with_bi_tags(el):
        res = [ ]
        if el.text:
            res.append(el.text)
        for lel in el:
            res.append("<%s>" % lel.tag)
            res.append(gettext_with_bi_tags(lel))
            res.append("</%s>" % lel.tag)
            if lel.tail:
                res.append(lel.tail)
        return "".join(res)
    
    # collect the first hundred text elements from each page
    text=[]
    for page in pages:
        for el in list(page)[:100]:
            if el.tag == "text":
                text.append(gettext_with_bi_tags(el))
    return '\n'.join(text)
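A minimal, self-contained sketch of the same recursive flattening idea, using only lxml and an inline XML string (the <text> element and its contents here are made up for illustration, not taken from real pdftoxml output):

import lxml.etree

def flatten_with_tags(el):
    # Recursively keep the inline <b>/<i> tags while joining the text.
    parts = []
    if el.text:
        parts.append(el.text)
    for child in el:
        parts.append("<%s>" % child.tag)
        parts.append(flatten_with_tags(child))
        parts.append("</%s>" % child.tag)
        if child.tail:
            parts.append(child.tail)
    return "".join(parts)

el = lxml.etree.fromstring('<text><b>Part1 <i>part 2</i></b> tail</text>')
print(flatten_with_tags(el))   # prints: <b>Part1 <i>part 2</i></b> tail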
Code Example #2
def load():
    index = [u'shipment_year', u'taxon_family']
    data = scraperwiki.swimport('csv2sw').read.csv('http://hacks.thomaslevine.com/top10.csv')
    for row in data:
        del(row[''])
    
    scraperwiki.sqlite.save(index, data, 't')
Code Example #3
def main():
  #Load
  xml=swimport('dsp').dsp('http://www.vbsmutualbank.co.za/ContactUs/ContactUs.htm')

  #Parse
  branches=xml.xpath('//table[@width="556"]/tr[position()>1 and position()<last()]')
  branches_text=[branch.xpath('td[position()=last()]')[0].text_content() for branch in branches]
  d=[parse_branch_text(t) for t in branches_text]
Code Example #4
def main():
  #Load
  xml=swimport('dsp').dsp('http://www.khula.org.za/Admin/Contacts/RegionalContacts.aspx',False)

  #Parse
  t_nodes=xml.xpath('//table[@width="100%"]')
  assert len(t_nodes)==1
  table=t_nodes[0]
  d=parse_table(table)
  t=time()
  for row in d:
    row["date_scraped"]=t
  d=moreparsing(d)
  save([],d,'final')
Code Example #5
def geocode():
    if "scraped" not in show_tables():
        d = swimport('csv2sw').read.csv('https://views.scraperwiki.com/run/combine_mix_scraper_spreadsheets/')
        save([], d, 'scraped')

    if "address" not in show_tables():
        initialize()

    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]

        #print address
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params )
        commit()
Code Example #6
def geocode():
    if "scraped" not in show_tables():
        d = swimport('csv2sw').read.csv('http://hacks.thomaslevine.com/all.csv')
        save([], d, 'scraped')
        execute('DELETE FROM `scraped` WHERE `Country` != "South Africa"')
        commit()

    if "address" not in show_tables():
        initialize()

    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]

        #print address
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params )
        commit()
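The two geocode() variants above follow the same work-queue pattern: pull one unfinished address, geocode it only if no cached result exists, then mark it finished. A stripped-down sketch of just that pattern using the standard sqlite3 module (table and column names are illustrative, and fake_geocode stands in for the all_services() helper):

import sqlite3

def fake_geocode(addr):
    # Stand-in for a real geocoding call such as all_services().
    return {'address': addr, 'lat': 0.0, 'lng': 0.0}

db = sqlite3.connect(':memory:')
db.execute('CREATE TABLE address (addr TEXT, finished INTEGER DEFAULT 0)')
db.execute('CREATE TABLE geocode (addr TEXT, lat REAL, lng REAL)')
db.executemany('INSERT INTO address (addr) VALUES (?)', [('1 Main St',), ('2 High St',)])

while db.execute('SELECT count(*) FROM address WHERE finished = 0').fetchone()[0] > 0:
    addr = db.execute('SELECT addr FROM address WHERE finished = 0 LIMIT 1').fetchone()[0]
    # Only geocode addresses we have not seen before.
    if db.execute('SELECT count(*) FROM geocode WHERE addr = ?', (addr,)).fetchone()[0] == 0:
        row = fake_geocode(addr)
        db.execute('INSERT INTO geocode VALUES (?, ?, ?)', (row['address'], row['lat'], row['lng']))
    db.execute('UPDATE address SET finished = 1 WHERE addr = ?', (addr,))
    db.commit()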
Code Example #7
from scraperwiki import swimport
from scraperwiki.sqlite import save
keyify=swimport('keyify').keyify
dsp=swimport('dsp').dsp
from time import time
from copy import copy
DATE=time()
strip_address = swimport('strip_address').strip_address

URLBASE="http://www.marang.co.za/"
class ParseError(Exception):
  pass

#Main controls
def main():
  links=get_links()
  for path in links:
    #Select only the interesting ones
    #And something's wrong with the history page.
    if "-" in path and path!='marang-history.asp':
      url='%s%s'%(URLBASE,path)
      print "Downloading %s" % url
      xml=dsp(url,False)

      if path=="branch-regional.asp":
        parse_regional_offices(xml,url)
      elif path=="marang-map.asp":
        parse_map(xml)
      elif path[0:6]=="branch":
        try:
          parse_branch(xml,url,path)
Code Example #8
# -*- coding: UTF-8 -*-
import scraperwiki
import json
from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html
import resource
import sys
import urlparse
import re

# Make sure ScraperWiki believes this is the source for this database
scraperwiki.scrape("http://www.hive.no/postjournal/")

lazycache = scraperwiki.swimport("lazycache")
postlistelib = scraperwiki.swimport("postliste-python-lib")

agency = "Høgskolen i Vestfold"


def report_errors(errors):
    if 0 < len(errors):
        print "Errors:"
        for e in errors:
            print e
        exit(1)


def out_of_cpu(arg, spent, hard, soft):
    report_errors(arg)
Code Example #9
# Blank Python

import scraperwiki
utils=scraperwiki.swimport('hildenae_utils')
import lxml.html
from lxml import etree

#scraperwiki.utils.httpresponseheader("Content-Type", 'text/plain; charset="utf-8"')

#scraperwiki.sqlite.attach("sopp-middag-view", "src")


def extractTable(root):
    for el in root.cssselect("div.content table.contentpaneopen table"):
        tableSource = lxml.html.tostring(el)
        if "Ukedag" in tableSource:
            return el        

def cleanup(table):
    etree.strip_tags(table,'span','strong','div', 'tbody')
    for tag in table.iter():
        for att in tag.attrib.keys():
            tag.attrib.pop(att)
        if tag.tag == "table": tag.set('border','1')
    return table

def tds(td):
    tdstr = lxml.etree.tostring(td)
    cleaned = tdstr.replace('&#13;','').replace('&#160;', ' ').replace('/n', '').replace('</td>', '').replace('<td>', '').replace('<p>','').replace('<br />','<br>')
    cleaned2 = utils.removeDoubleSpaces(cleaned).replace('</p>','<br>').replace('<br><br>','<br>').replace('> ','>').replace(' <','<')
    if cleaned2.endswith("<br>"):
Code Example #10
import scraperwiki
import sys
import urllib
import lxml.etree
import re
import datetime

wikipedia_utils = scraperwiki.swimport("wikipedia_utils")

nowtime = datetime.datetime.now()

def UpdateCategory():
    ldata = wikipedia_utils.GetWikipediaCategoryRecurse("Caves_of_the_United_Kingdom")
    for data in ldata:
        data["updatedate"] = nowtime
    scraperwiki.sqlite.save(["title"], ldata, "cavepages")
    scraperwiki.sqlite.execute("delete from cavepages where updatedate<>?", nowtime.isoformat())

def ExtractInfo():
    rdata = scraperwiki.sqlite.select("title, link from cavepages")
    ldata = [ ]
    for pdata in rdata:
        try:
            val = wikipedia_utils.GetWikipediaPage(pdata["title"])
        except IOError:
            print "Skipping", pdata["title"]
            continue
        res = wikipedia_utils.ParseTemplates(val["text"])
        #print dict(res["templates"]).keys()
        data = dict(res["templates"]).get("Infobox ukcave")
Code Example #11
import scraperwiki

search = scraperwiki.swimport('twitter_search_extended').search

search(['viaplay'], num_pages=25)
Code Example #12
import scraperwiki
search = scraperwiki.swimport("twitter_search_extended").search
search(["olympics"], num_pages=5)
Code Example #13
import cgi
import os
import re
import json
import lxml.html
import scraperwiki
import stdnum.isbn

qdict = dict(cgi.parse_qsl(os.getenv("QUERY_STRING", "")))
url = qdict.get("url", 'http://www.wellcome.ac.uk/stellent/groups/corporatesite/@msh_peda/documents/web_document/wtd003419.pdf')

try:
    sdata = scraperwiki.sqlite.select("url, pages, isbns, final_isbn from swdata where url=? limit 1", (url,))
except scraperwiki.sqlite.SqliteError:
    sdata = [ ]

if sdata:
    data = sdata[0]
    data = { "url":sdata[0]["url"], "pages":sdata[0]["pages"], 
             "isbns":json.loads(sdata[0]["isbns"]), "final_isbn":sdata[0]["final_isbn"] }

else:
    lazycache=scraperwiki.swimport('lazycache')
    pdfdata=lazycache.lazycache(url)
    xmldata = scraperwiki.pdftoxml(pdfdata)
    root=lxml.html.fromstring(xmldata)
    pages = list(root)
    isbns = [ ]
    fisbn = ""
    for i, page in enumerate(pages):
        pagetext = " ".join(t.text_content()  for t in page)
        for isbn_org, isbn in re.findall("(ISBN[\s:]*([\d\s\-]+))(?i)", pagetext):
            isbn = re.sub("[^\d]", "", isbn)
            if not stdnum.isbn.is_valid(isbn):
                isbn = ""
            else:
                fisbn = isbn
            isbns.append({"isbn_org":isbn_org, "isbn":isbn})
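The regex-plus-checksum step above is the heart of this example. Here is a minimal standalone sketch of the same idea that validates ISBN-13 digits by hand instead of going through stdnum (the sample string is made up):

import re

def isbn13_ok(digits):
    # ISBN-13 check: digits weighted 1,3,1,3,... must sum to a multiple of 10.
    if len(digits) != 13:
        return False
    total = sum(int(d) * (3 if i % 2 else 1) for i, d in enumerate(digits))
    return total % 10 == 0

text = "Report 2010, ISBN: 978-0-306-40615-7, 96 pages"
for raw in re.findall(r"ISBN[\s:]*([\d\s\-]+)", text):
    digits = re.sub(r"[^\d]", "", raw)
    print("%s valid: %s" % (digits, isbn13_ok(digits)))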
Code Example #14
import scraperwiki
import simplejson
import urllib2
from scraperwiki import swimport


def get_followers(twitter_handle):
    base_url = 'https://api.twitter.com/1/followers/ids.json?cursor=-1&screen_name=' + twitter_handle
    results_json = simplejson.loads(scraperwiki.scrape(base_url))
    return results_json['ids']


followers = get_followers("gormno")

myfollowers_str = map(str, followers)

swimport('twitter_bulk_users_lookup_3').bulklookup("gormno", myfollowers_str)

followers = get_followers("gormer")

myfollowers_str = map(str, followers)

swimport('twitter_bulk_users_lookup_3').bulklookup("gormer", myfollowers_str)
Code Example #15
from scraperwiki import swimport

swimport('twitter_search_extended').search(['#actuary'], num_pages = 5)
Code Example #16
from scraperwiki import swimport
from scraperwiki.sqlite import save, execute, commit, show_tables
from lxml.html import fromstring, tostring
from requests import get
options=swimport('options').options
keyify=swimport('keyify').keyify
randomsleep=swimport('randomsleep').randomsleep
from time import time
import re
strip_address = swimport('strip_address').strip_address

DATE=time()

DOMAIN = 'http://www.thuthukani.co.za'

def getprovincepages():
    html = fromstring(get('http://www.thuthukani.co.za/branches-mpumalanga.php').content)
    selects = html.cssselect('select')
    assert 1 == len(selects)
    d = options(selects[0], textname = 'province', valuename = 'url')
    for row in d:
        row['url'] = DOMAIN + '/' + row['url']
        row['date_scraped'] = DATE
    return d

def parseprovincepage(province):
    raw = get(province['url']).content
    withspaces = raw.replace('<br />', '\n').replace('<font', '\n\n<font')
    html = fromstring(withspaces)
    fonts = html.xpath('//td[@valign="top"]/font[img[@src="images/branches.jpg"]]')
    assert 1==len(fonts)
Code Example #17
import scraperwiki
import urllib2
import urlparse
import re
import os
import time

filename = ""

# to do.  
#  Detect dates
#  Do equates
#  Detect use of DistoX to distinguish computer generated numbers
print "2"
irebyurl = "http://cave-registry.org.uk/svn/Yorkshire/mmmmc/survexdata/all.svx"
treextract = scraperwiki.swimport("apache_directory_tree_extractor")
print treextract.ParseSVNRevPageTree("http://cave-registry.org.uk/svn/Yorkshire/mmmmc/survexdata/")
print "3"
allteam = set()


def Main():
    survexblock = SurvexBlock(name="root", survexpath="caves", parent=None, begin_char=0, cave="Ireby", survexfile=irebyurl, totalleglength=0.0)
    fin = GetFile(irebyurl)
    textlines = [ ]
    #print "before", fin
    RecursiveLoad(survexblock, irebyurl, fin, textlines)
    return survexblock

def GetFile(url):
    try:
Code Example #18
from scraperwiki import swimport
TABLE_NAME='nytm_sos'

swimport('meetup').scrape('http://www.meetup.com/ny-tech/events/47879702/',TABLE_NAME)
swimport('swversion').swversion(TABLE_NAME)

print """
How to get the current table:
SELECT * from (SELECT value_blob FROM `swvariables` WHERE name="nytm_sos_current");
"""from scraperwiki import swimport
TABLE_NAME='nytm_sos'

swimport('meetup').scrape('http://www.meetup.com/ny-tech/events/47879702/',TABLE_NAME)
swimport('swversion').swversion(TABLE_NAME)

print """
How to get the current table:
SELECT * from (SELECT value_blob FROM `swvariables` WHERE name="nytm_sos_current");
"""
Code Example #19
import scraperwiki

from scraperwiki import swimport

swimport('twitter_scrape').statuses('regjeringen', 'departementene', 1)

Code Example #20
from urllib2 import urlopen
from lxml.html import fromstring, tostring
import scraperwiki
import datetime

keyify = scraperwiki.swimport('keyify').keyify

#call the page
page = urlopen('http://dashboard.ed.gov/statecomparison.aspx?i=j&id=0&wt=40')
rawtext = page.read()
html = fromstring(rawtext)

#Select all the tables on the page
tables = html.cssselect('table')
#Take the first table
table = tables[0]

#print the entire table
print tostring(table)

#get the rows inside table 0
rows = table.cssselect('tr')
headers = [th.text_content() for th in rows[0].cssselect('th')]

#headers we expect, used to validate the scraped table
expected_headers = [
    'State', 'Total', u'\xa0', 'White', 'Black', 'Hispanic', 'Asian',
    'Native Hawaiian/Pacific Islander', 'American Indian/Alaska Native',
    'Two or more races'
]
assert headers == expected_headers, headers
Code Example #21
from time import time
from scraperwiki.sqlite import save,select
from scraperwiki import swimport
from lxml.etree import fromstring
from urllib2 import urlopen
import re
strip_address = swimport('strip_address').strip_address

DATE=time()
URL="http://www.saccol.org.za/saccos_in_saccol.php"

def main():
  d=download()
  save([],d,'initial')
  d=clean()
  save([],d,'final')

def download():
  #Load page
  raw=urlopen(URL).read()
  cleaned=clean_page(raw)

  #Load table
  table=fromstring(cleaned)
  d=parse_table(table)
  return d

def clean():
  d=select('* from `initial`')
  for row in d:
    row['date_scraped']=DATE
Code Example #22
def main():
  x=swimport('dsp').dsp(URL,False)
  satellite=x.xpath('//div[@class="grid_3 alpha omega block"]/p/text()')
  branch=x.xpath('//div[@class="grid_6 alpha omega block"]/p/text()')
  save(["satellite"],[{"date_scraped":DATE,"satellite":clean(s),"branch":clean(b)} for (s,b) in zip(satellite,branch)],"final")
Code Example #23
import scraperwiki
import json
import requests
import lxml.html
import itertools
import datetime

swutils=scraperwiki.swimport('swutils')
user_agent = swutils.get_user_agent()
    
requests.defaults.defaults['max_retries'] = 10

def main():
    
    print 'scraping November 2012 json feed'
    
    json_url = "http://ec2-46-51-135-144.eu-west-1.compute.amazonaws.com/gaza/Nov2012/json"
    json_response = requests.get(json_url).text
    json_dict = json.loads(json_response)
    
    batch = []
    alreadygot = []
    try:
        rows = scraperwiki.sqlite.select('id from features')
        for row in rows:
            alreadygot.append(row['id'])
    except:
        pass

    if len(json_dict['features']) - len(alreadygot) > 0:
        print 'scraping details for %s new features' % (len(json_dict['features']) - len(alreadygot))
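The alreadygot bookkeeping above is a common incremental-scrape pattern: read the ids already saved, then only fetch details for features that are new. A compact sketch of just that diffing step, with plain sqlite3 and a made-up feature list:

import sqlite3

db = sqlite3.connect(':memory:')
db.execute('CREATE TABLE features (id INTEGER PRIMARY KEY)')
db.executemany('INSERT INTO features (id) VALUES (?)', [(1,), (2,)])

# Pretend this came from the JSON feed.
feed = [{'id': 1}, {'id': 2}, {'id': 3}]

alreadygot = set(row[0] for row in db.execute('SELECT id FROM features'))
new_features = [f for f in feed if f['id'] not in alreadygot]
print("%d new features to scrape" % len(new_features))   # 1 new feature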
Code Example #24
# See also
# https://views.scraperwiki.com/run/pdf-to-html-preview-1/?url=http%3A%2F%2Fwww.stortinget.no%2FGlobal%2Fpdf%2Fpostjournal%2Fpj-2012-05-09.pdf

import scraperwiki
import json
from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html
import urlparse
import re

scraperwiki.scrape("http://www.bydel-ullern.oslo.kommune.no/postjournal/")

#lazycache=scraperwiki.swimport('lazycache')
postlistelib=scraperwiki.swimport('postliste-python-lib')

agency = 'Oslo kommune, Ullern bydel'

def report_errors(errors):
    if 0 < len(errors):
        print "Errors:"
        for e in errors:
            print e
        raise ValueError("Something went wrong")

def out_of_cpu(arg, spent, hard, soft):
    report_errors(arg)

def process_pdf(parser, pdfurl, errors):
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
Code Example #25
from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit
from scraperwiki import swimport
from requests import session
from lxml.html import fromstring, tostring
import re
from time import time, sleep
keyify=swimport('keyify').keyify

URL="http://www.ncr.org.za/register_of_registrants/index.php"

#DEV=True
DEV=False

DATE = get_var('DATE', time())

RE={
  'leftpadding':re.compile(r'^ *')
, 'rightpadding':re.compile(r' *$')
}

def cp1():
  execute('''
CREATE TABLE IF NOT EXISTS `businessPremises` (
  `date_scraped` REAL,
  `businessPremisesURL` TEXT,
  FOREIGN KEY (date_scraped, businessPremisesUrl)
  REFERENCES cp1(date_scraped, businessPremisesUrl)
)
''')

  if get_var('crashed') == 1:
Code Example #26
from scraperwiki import swimport
from scraperwiki.sqlite import save
keyify = swimport('keyify').keyify
dsp = swimport('dsp').dsp
from time import time
from copy import copy
DATE = time()
strip_address = swimport('strip_address').strip_address

URLBASE = "http://www.marang.co.za/"


class ParseError(Exception):
    pass


#Main controls
def main():
    links = get_links()
    for path in links:
        #Select only the interesting ones
        #And something's wrong with the history page.
        if "-" in path and path != 'marang-history.asp':
            url = '%s%s' % (URLBASE, path)
            print "Downloading %s" % url
            xml = dsp(url, False)

            if path == "branch-regional.asp":
                parse_regional_offices(xml, url)
            elif path == "marang-map.asp":
                parse_map(xml)
Code Example #27
#ScraperWiki scraper that fetches road lengths from the road lists of the Norwegian Public Roads Administration (Vegvesenet)
#By Gnonthgol

import scraperwiki
import urllib2
import lxml.etree
import re
utils=scraperwiki.swimport('hildenae_utils') # for caching + pretty-print

res = {}

#Fetched from http://www.vegvesen.no/Kjoretoy/Yrkestransport/Veglister+og+dispensasjoner/Veglister+2012
url = "http://www.vegvesen.no/_attachment/314828/binary/553942"

xmldata = utils.findInCache(url)
if xmldata is None:
    pdfdata = urllib2.urlopen(url).read()
    xmldata = scraperwiki.pdftoxml(pdfdata)
    utils.putInCache(url, xmldata)

root = lxml.etree.fromstring(xmldata)
pages = list(root)

for page in pages:
    ref = None
    for el in list(page):
        if el.tag == "text" and el.text:
            if el.text.find('FV ') == 0 or el.text.find('KV ') == 0:
                ref = el.text.strip(" *")
            if ref and re.match("(\d*,\d{3})", el.text):
                if not res.has_key(ref):
Code Example #28
# -*- coding: UTF-8 -*-
import scraperwiki
import json
from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html
import resource
import sys
import urlparse
import re

# Make sure ScraperWiki believes this is the source for this database
scraperwiki.scrape("http://hist.no/content/34570/Offentleg-Journal")

lazycache = scraperwiki.swimport('lazycache')
postlistelib = scraperwiki.swimport('postliste-python-lib')

agency = 'Høgskolen i Vestfold'


def report_errors(errors):
    if 0 < len(errors):
        print "Errors:"
        for e in errors:
            print e
        exit(1)


def out_of_cpu(arg, spent, hard, soft):
    report_errors(arg)
Code Example #29
import scraperwiki
import simplejson
import urllib2
from scraperwiki import swimport

myfollowers = []
twitter_handle = 'GotToDance_Sky1'

base_url = 'https://api.twitter.com/1/followers/ids.json?cursor=-1&screen_name=' + twitter_handle 
results_json = simplejson.loads(scraperwiki.scrape(base_url))
myfollowers = results_json['ids']
myfollowers_str = map(str, myfollowers) 



swimport('twitter_bulk_users_lookup').bulklookup(myfollowers_str)


'''
See https://scraperwiki.com/scrapers/twitter_bulk_users_lookup/ for the code for the script

Still to do
-add parameter for ID and username (usertype)
'''
Code Example #30
from scraperwiki import swimport
swimport('twitter_search').search(['@thomaslevine', 'from:thomaslevine'])
Code Example #31
from scraperwiki import swimport
csv2dict = swimport('csv2sw').csv2dict

# Write a csv
f = open('foobar.csv', 'w')
f.write('foo,bar,baz\n4,63,3')
f.close()

# Convert it to a dictionary
g = open('foobar.csv', 'r')
d = csv2dict(g)
print d
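For comparison, the same round trip can be done without swimport using only the csv module from the standard library; this is just a sketch of the idea, not the csv2sw helper itself:

import csv

# Write a csv
f = open('foobar.csv', 'w')
f.write('foo,bar,baz\n4,63,3')
f.close()

# Read it back as a list of dictionaries
g = open('foobar.csv', 'r')
rows = list(csv.DictReader(g))
g.close()
print(rows)   # one row, with 'foo', 'bar', 'baz' keys as strings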
Code Example #32
import scraperwiki

search = scraperwiki.swimport('twitter_search_extended').search

search(['olympics'], num_pages=5)
Code Example #33
"Because this website's data span many pages, this scraper needs to be run about nine times to make a full scrape."

from lxml.html import fromstring
#from lxml.etree import fromstring
from time import time
import requests
from scraperwiki.sqlite import save,save_var,get_var,show_tables,select,commit,execute
from scraperwiki import swimport
options=swimport('options').options
keyify=swimport('keyify').keyify
from json import loads,dumps
strip_address = swimport('strip_address').strip_address

URLS={
  "main":"http://www.nedbank.co.za/website/content/map/branches.asp"
, "suburbs-base":"http://www.nedbank.co.za/website/content/map/getSuburbs.asp?q="
, "cities-base":"http://www.nedbank.co.za/website/content/map/getData.asp?q="
}

def main():
  if get_var('province')=='step2':
    separate_addresses()
    execute('DELETE FROM swvariables WHERE name = "province"')
    commit()
    print("""
    ================================
    This run is finished!
    ================================
    """)
  else:
    download()
Code Example #34
from scraperwiki import swimport
bucketwheel = swimport('bucketwheel')
from scraperwiki.sqlite import execute, commit

execute('drop table if exists stack')
commit()
bucketwheel.seed([bucketwheel.GetLinks('http://thomaslevine.com')])
Code Example #35
from lxml.html import fromstring
from urllib2 import urlopen
from time import time
import re

from scraperwiki.sqlite import save, select
from scraperwiki import swimport
randomsleep=swimport('randomsleep').randomsleep

DATE=time()
DOMAIN="http://www.bk.rw"
STARTURL="http://www.bk.rw/index.php?option=com_content&view=article&id=54&Itemid=57"

def download():
  url=STARTURL
  d=[]
  while url!=None:
    a=Article(url)
    d.extend(a.parse())
    url=a.next()
    randomsleep()
  save(['date-scraped','branch-number'],d,'raw')

class MultipleNextButtons(Exception):
  pass

class Article:
  def __init__(self,url):
    self.url=url
    self.x=fromstring(urlopen(url).read())
Code Example #36
from requests import get
from string import ascii_lowercase
from lxml.html import fromstring
from scraperwiki import swimport
from scraperwiki.sqlite import save,select,get_var,save_var,execute
keyify=swimport('keyify').keyify
from time import time
import re

DATE=time()

def main():
  if None==get_var('downloaded'):
    download()
    save_var('downloaded',1)
  execute('DROP TABLE IF EXISTS `final`')
  clean()
  save_var('downloaded',None)

def clean():
  d=select('* FROM `swdata` WHERE `date_scraped`=(SELECT max(`date_scraped`) FROM `swdata`);')
  POSTCODE=re.compile(r'[0-9]*$')
  for row in d:
    row['postcode']=re.findall(POSTCODE,row['Postal_Address'])[0]
  save([],d,'final')

def download():
  d=[]
  for letter in ascii_lowercase:
    x=search_letter(letter)
    branch_tables=x.cssselect('table.locatorTable table')
Code Example #37
import scraperwiki
import urllib

pdf2xml = scraperwiki.swimport("pdf2xml")
pdfurl = "http://samplepdf.com/sample.pdf"

# example PDF
pdfurl = 'http://samplepdf.com/sample.pdf'
pdf = urllib.urlopen(pdfurl).read()

# The C executable:
print scraperwiki.pdftoxml(pdf)
# The PDFMiner-based version from https://github.com/zejn/pypdf2xml/blob/master/pdf2xml
print pdf2xml.pdf2xml(pdf)

# Notes:
#  It may be that this is better delivered as a ScraperWiki view where you pass in the URL as an argument, and the view
#  downloads and parses it, so it can be used in all languages.
#  The disadvantage is that you don't get to hold, and therefore cache, the PDF source itself.

# The software version is better at handling non-ascii or otherwise custom glyphs that come with supplied
# glyph-to-character maps.  -- [example?]

# Also it is my (Julian_Todd's) intention to get a version that preserves borders of table cells, which nothing does.  
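On the caching point in the notes above, a small local cache outside ScraperWiki is easy to sketch. This is a hypothetical helper (not lazycache, not part of the scraperwiki library) that stores each downloaded PDF under a hash of its URL so repeated runs reuse the local copy:

import hashlib
import os
import urllib

CACHE_DIR = "pdfcache"  # hypothetical local cache directory

def cached_pdf(url):
    # Key the cache file on a hash of the URL.
    if not os.path.isdir(CACHE_DIR):
        os.makedirs(CACHE_DIR)
    path = os.path.join(CACHE_DIR, hashlib.sha1(url).hexdigest() + ".pdf")
    if os.path.exists(path):
        return open(path, "rb").read()
    data = urllib.urlopen(url).read()
    open(path, "wb").write(data)
    return data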
Code Example #38
from scraperwiki import swimport
from scraperwiki.sqlite import save, select, execute, commit
eventbrite = swimport('eventbrite')

def union(year, eventbrite_name, table_name):
    eventbrite.scrape('http://%s.eventbrite.com' % eventbrite_name, table_name)
    d = select('* from `%s`' % table_name)
    for row in d:
        row['year'] = year
    save([], d, 'tcamp')

def download():
    execute('CREATE TABLE IF NOT EXISTS `tcamp`'
        '(`year` integer, `first_scraped` real, `Twitter handle` text, `Intro for your fellow campers` text)')
    execute('DELETE FROM tcamp')
    union(2013, 'tcamp13', '2013')
    union(2012, 'tcamp12', '2012')
    union(2011, 'tcamp11', '2011')

def aggregate():
    execute('create table if not exists twitter (handle text, times integer)')
    execute('create unique index if not exists twitter_handle on twitter(handle)')
    execute('delete from twitter where 1 = 1')
    execute('insert into twitter '
        'select replace(`twitter handle`, "@", ""), count(*) from `tcamp` '
        'where `twitter handle` is not null group by [twitter handle]'
    )
    commit()

download()
aggregate()
Code Example #39
from scraperwiki import swimport
from scraperwiki.sqlite import save
dsp=swimport('dsp').dsp
from time import time

class ParseError(Exception):
  pass

DATE=time()

#Load
xml=dsp('http://www.tebabank.co.za/dist_atm.php',False)

#Parse
towns=xml.xpath('//ul[@class="bulletlist"]')

for town in towns:
  townnames=town.xpath('preceding-sibling::h2[position()=1]/text()')
  l=len(townnames)
  if l!=1:
    raise ParseError("There is supposed to be exactly one town name, but %d were found." % l)
  else:
    townname=townnames[0]

  addresses=town.xpath('li/text()')

  #Save
  for address in addresses:
    save([],{
      "town":townname
    , "address":address
    , "date_scraped":DATE
    },'swdata')
Code Example #40
import scraperwiki
import simplejson
import urllib2
from scraperwiki import swimport

myfollowers = []
twitter_handle = 'urban_nerds'

base_url = 'https://api.twitter.com/1/followers/ids.json?cursor=-1&screen_name=' + twitter_handle 
results_json = simplejson.loads(scraperwiki.scrape(base_url))
myfollowers = results_json['ids']
myfollowers_str = map(str, myfollowers) 



swimport('twitter_bulk_users_lookup').bulklookup(myfollowers_str)


'''
See https://scraperwiki.com/scrapers/twitter_bulk_users_lookup/ for the code for the script

Still to do
-add parameter for ID and username (usertype)
'''
Code Example #41
# Blank Python

from scraperwiki import swimport
swimport('twitter_search').search(['@dmsilvabr', 'from:dmsilvabr'])
Code Example #42
File: scraper.py (project: backgroundcheck/dftgovuk2)
import scraperwiki
import lxml.html
import requests
import xlrd, re
import dateutil.parser
import urlparse
import json

html2text = scraperwiki.swimport('html2text')

xlurl = 'https://dl.dropbox.com/s/ugyauhxhs3xuf1z/dft_scraping_instructions_new.xls?dl=1'  #'dft scraping instructions new' doc emailed by dragon on sep 12.


# julians xls loader
def LoadXLlinks():
    book = xlrd.open_workbook(
        file_contents=requests.get(xlurl, verify=False).content)
    for i in [0, 2, 4, 6, 8, 10, 12]:
        sheet = book.sheet_by_index(i)
        print sheet.name, "rows:", sheet.nrows
        ldata = []
        for i in range(sheet.nrows):
            sheetvalue = sheet.cell(i, 0).value.strip()
            if sheetvalue:
                ldata.append({
                    "sheetname": sheet.name,
                    "i": i,
                    "url": sheetvalue
                })
        scraperwiki.sqlite.save(["sheetname", "i"], ldata, "xllinks")
Code Example #43
from time import time
try:
  from scraperwiki.sqlite import save,get_var,save_var
  from scraperwiki import swimport
except ImportError:
  def save(a,b,c):
    print(b)

  __VARS={}
  def get_var(a):
    if __VARS.has_key(a):
      return __VARS[a]
  def save_var(a,b):
    __VARS[a]=b

  def options(*args,**kwargs):
    return [{"branchId":"174","branchName":"DUNNO"}]
else:
  options=swimport('options').options

URL="http://www.postbank.co.za/contact.aspx?ID=3"

def log(foo):
  print(foo)

if get_var('previous_branchId')==None:
  save_var('DATE',time())
  FIRST_RUN=True
else:
  FIRST_RUN=False

DATE=get_var('DATE')

def main():
Code Example #44
import scraperwiki

from scraperwiki import swimport

search = swimport('twitter_search').search
search(['from:peterwalker99'])

#print search

Code Example #45
from scraperwiki import swimport
swimport('eb').scrape('http://jdcny.eventbrite.com/')
Code Example #46
import scraperwiki
import sys
import urllib
import lxml.etree
import re
import datetime

wikipedia_utils = scraperwiki.swimport("wikipedia_utils")

nowtime = datetime.datetime.now()


def UpdateCategory():
    ldata = wikipedia_utils.GetWikipediaCategoryRecurse(
        "Caves_of_the_United_Kingdom")
    for data in ldata:
        data["updatedate"] = nowtime
    scraperwiki.sqlite.save(["title"], ldata, "cavepages")
    scraperwiki.sqlite.execute("delete from cavepages where updatedate<>?",
                               nowtime.isoformat())


def ExtractInfo():
    rdata = scraperwiki.sqlite.select("title, link from cavepages")
    ldata = []
    for pdata in rdata:
        try:
            val = wikipedia_utils.GetWikipediaPage(pdata["title"])
        except IOError:
            print "Skipping", pdata["title"]
            continue
Code Example #47
from scraperwiki import swimport

swimport('twitter_search_extended').search(['multiple sclerosis'], num_pages = 100)
Code Example #48
from scraperwiki import swimport

swimport('twitter_search_extended').search(['Bosone di Higgs'], num_pages=100)
Code Example #49
import scraperwiki
import lxml.html
import requests
#import xlrd, re
import dateutil.parser
#import urlparse
#import json
from hashlib import sha1
from random import random
import re

date_check = re.compile("\d+ \S+ \d\d\d\d")



html2text=scraperwiki.swimport('html2text')

# julians xls loader
def LoadXLlinks():
    book = xlrd.open_workbook(file_contents=requests.get(xlurl,verify=False).content)
    for i in [0,2,4,6,8,10,12]:
        sheet = book.sheet_by_index(i)
        print sheet.name, "rows:", sheet.nrows
        ldata = [ ] 
        for i in range(sheet.nrows): 
            sheetvalue = sheet.cell(i, 0).value.strip()
            if sheetvalue:
                ldata.append({"sheetname":sheet.name, "i":i, "url":sheetvalue})
        scraperwiki.sqlite.save(["sheetname", "i"], ldata, "xllinks")

# julians raw scraper
Code Example #50
import scraperwiki
import lxml.html
lazycache=scraperwiki.swimport('lazycache')

#url='http://www.mod.uk/DefenceInternet/AboutDefence/CorporatePublications/ConsultationsandCommunications/PublicConsultations/GpssConsultation.htm'
url='http://www.mod.uk/DefenceInternet/DefenceNews/DefencePolicyAndBusiness/FreeTravelForMilitaryPersonnelSupportingSecurityForThe2012Games.htm'
html=lazycache.lazycache(url)
root=lxml.html.fromstring(html)
root.make_links_absolute(url)
print html

def parsenews(root):
    data={}
    divs= root.xpath("//div[@id='left-column']/div[not(@class)]/p[1]/..")
    for x in divs:
        print x.attrib, lxml.html.tostring(x),'*'

parsenews(root)


def parseconsult(root):
    data={}
    data['title']=root.cssselect('h1')[0].text
    #data['body']=lxml.html.tostring(root.xpath("//div[@class='consultationcontent'][2]")[0])
    data['ref'] = root.xpath("//div[@property='dc:identifier']/text()")[0]
    data['assoc_org']='MoD'
    
    data['attachments']=[]
    for attachment in root.xpath("//a[@rel='DC:haspart']"):
        url=attachment.attrib['href']
        name=attachment.text
Code Example #51
from scraperwiki.sqlite import save, select
from scraperwiki import swimport
from requests import session
from lxml.html import fromstring, tostring
import re
from time import time
keyify=swimport('keyify').keyify
randomsleep=swimport('randomsleep').randomsleep

URL="http://www.ncr.org.za/register_of_registrants/index.php"

#DEV=True
DEV=False

DATE=time()

RE={
  'leftpadding':re.compile(r'^ *')
, 'rightpadding':re.compile(r' *$')
}

def headquarters():
  cp1()
  #dc_and_cb1()

def cp1():
  p=Page('CP1')

  while p.lastpage()==False:
    tables=p.table().subtables()
    d = []
Code Example #52
from scraperwiki import swimport 

swimport('twitter_search_extended').search(['politikk', 'kommunikasjon'])
Code Example #53
# -*- coding: UTF-8 -*-

import scraperwiki
import json
from BeautifulSoup import BeautifulSoup
import datetime
import dateutil.parser
import lxml.html
import resource
import sys
import urlparse
import re
lazycache=scraperwiki.swimport('lazycache')
postlistelib=scraperwiki.swimport('postliste-python-lib')

agency = 'Luftambulansetjenesten ANS'

def report_errors(errors):
    if 0 < len(errors):
        print "Errors:"
        for e in errors:
            print e
        exit(1)
def out_of_cpu(arg, spent, hard, soft):
    report_errors(arg)

def process_pdf(parser, pdfurl, errors):
    errors = []
    postlistelib.exit_if_no_cpu_left(0, out_of_cpu, errors)
    try:
        pdfcontent = scraperwiki.scrape(pdfurl)
Code Example #54
def load_disclosures():
  csv=urlopen(URL)
  d=swimport('csv2sw').csv2json(csv)
  save([],d,'disclosures')
Code Example #55
import scraperwiki

from scraperwiki import swimport

search = swimport('twitter_search').search
search(['from:random_robbie'])

#print search



Code Example #56
import scraperwiki
import json
import requests
import lxml.html
import itertools
import datetime

swutils = scraperwiki.swimport('swutils')
user_agent = swutils.get_user_agent()

requests.defaults.defaults['max_retries'] = 10


def main():

    print 'scraping November 2012 json feed'

    json_url = "http://ec2-46-51-135-144.eu-west-1.compute.amazonaws.com/gaza/Nov2012/json"
    json_response = requests.get(json_url).text
    json_dict = json.loads(json_response)

    batch = []
    alreadygot = []
    try:
        rows = scraperwiki.sqlite.select('id from features')
        for row in rows:
            alreadygot.append(row['id'])
    except:
        pass

    if len(json_dict['features']) - len(alreadygot) > 0: