Code example #1
def main():
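    # The list of address columns still to geocode is checkpointed in 'columns_to_do', so an interrupted run resumes with the remaining columns.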
    if get_var('columns_to_do') == None:
        columns = COLUMNS
    else:
        columns = loads(get_var('columns_to_do'))

    while len(columns) > 0:
        column = columns[0]
        d = load_data(column)
        for row in d:
            p = Place(row[column], (row['latitude'], row['longitude']))
            row_geocode = p.geocode()
            row_geocode.update({
                "address-column": column,
                "branchId": row['branchId']
            })
            sleep(3)
            save([], row_geocode, 'geocoded')
        columns.remove(column)

        if len(columns) == 0:
            save_var('columns_to_do', None)
        else:
            save_var('columns_to_do', dumps(columns))
Code example #2
def main():
  if get_var('columns_to_do') == None:
    columns = COLUMNS
  else:
    columns = loads(get_var('columns_to_do'))

  while len(columns) > 0:
    column = columns[0]
    d = load_data(column)
    for row in d:
      p = Place(row[column], (row['latitude'], row['longitude']) )
      row_geocode = p.geocode()
      row_geocode.update({
        "address-column":column,
        "branchId": row['branchId']
      })
      sleep(3)
      save([], row_geocode, 'geocoded')
    columns.remove(column)

    if len(columns) == 0:
      save_var('columns_to_do',None)
    else:
      save_var('columns_to_do',dumps(columns))
Code example #3
def main():
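  # The saved 'step' records how far the pipeline got; a rerun resumes at the first unfinished stage and clears the marker when everything is done.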
  if get_var('step')==None:
    save_var('step',0)
  while get_var('step')!=None:
    if get_var('step')==0:
      download()
      save_var('step',1)
    elif get_var('step')==1:
      moreparsing_map()
      save_var('step',2)
    else:
      #Scraper is finished; reset
      save_var('step',None)
Code example #4
def atomic():
  if "client"==pagetype(get_var('previous_href')):
    table_names=CLIENT_TABLES
  elif "lobbyist"==pagetype(get_var('previous_href')):
    table_names=LOBBYIST_TABLES
  else:
    raise ResumeError('The type of the previous href, "%s", could not be determined.' % get_var('previous_href'))

  if "clients_urls" in show_tables():
    sourceUrl=select('distinct sourceUrl as "s" from `clients_urls` where jobId=(select max(jobId) from `clients_urls`)')[0]['s']
    for table_name in table_names:
      execute('DELETE FROM `%s` where jobId in (select jobId from clients_urls where sourceUrl="%s")' % (table_name,sourceUrl))
    commit()
    return sourceUrl
Code example #5
def getroutes():
    skip = get_var('skip')
    json = urlopen(
        "http://coach.iriscouch.com/routes/_design/coach/_view/fullRoutes?skip=%d&limit=%d"
        % (skip, skip + 1000)).read()
    table = loads(json)['rows']
    return table
Code example #6
def main():
  if get_var('skip')==None:
    save_var('skip',0)
  routesTable=getroutes()
  for row in routesTable:
    if row['key'][0:2]!=row['key'][2:4]:
      get_route_schedules(row['id'],row['key'])
Code example #7
def main():
  if None==get_var('DATE'):
    save_var('DATE',time())

  searchTerms=get_searchTerms()
  for searchTerm in searchTerms:
    d=paginate(searchTerm)
    for row in d:
      row['date_scraped']=get_var('DATE')
      row['searchTerm']=searchTerm

    save_var('previous_searchTerm',searchTerm)
    save(['date_scraped', 'Name'],d,'initial')

  save_var('previous_searchTerm',None)
  save_var('DATE',None)
Code example #8
def shallow_scrape():
    br = mechanize.Browser()

    c = sqlite.get_var("last_page", 0) + 1
    max_c = c + 6
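    # 'last_page' is the last results page fully handled; each run covers at most six pages starting just after it.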

    resultspage = br.open("http://www.education.gov.uk/edubase/quickSearchResult.xhtml?page=%d" % c)

    while c < max_c:
        print ""
        print "Handling page %d..." % c
        print "  [" + br.geturl() + "]"

        ### extract data from page
        page = html.parse(resultspage)

        for u in page.getroot().findall("body/div/div/div/div/table/tr/td/table/tbody/tr/td/a"):
            urn = re.search("urn=([0-9]{6})", u.get("href")).group(1)
            yield urn

        ### get new page
        try:
            resultspage = br.follow_link(text="Next")
            sqlite.save_var("last_page", c)

            c += 1
            if c % 2 == 0:
                time.sleep(10)

        except mechanize.LinkNotFoundError:
            c += 1
            sqlite.save_var("last_page", 0)
            break
Code example #9
def get_searchTerms():
  searchTerm=get_var('previous_searchTerm')
  if searchTerm==None:
    i=0
  else:
    i=ascii_lowercase.index(searchTerm)+1
  return ascii_lowercase[i:]
Code example #10
def main():
    #finalpage=get_var('finalpage')
    prevpage = get_var('prevpage')

    #if None==finalpage:
    if True:
        finalpage = int(get_lastpage(getpage(1)))
        save_var('finalpage', finalpage)
    if None == prevpage:
        prevpage = 1

    if prevpage < finalpage:
        step1(prevpage, finalpage)
    elif prevpage == finalpage:
        if not "step2completion" in show_tables():
            execute(
                'create table `step2completion` (`url` text, `browsed` boolean)'
            )
            execute("""
        INSERT INTO `step2completion`
        ( url , browsed )
        SELECT url, 0 as "browsed"
        FROM locations
        """)
            commit()
        step2()
Code example #11
def main():
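  # The 'downloaded' flag means the raw data has already been fetched, so a rerun that died during clean() skips straight to cleaning.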
  if None==get_var('downloaded'):
    download()
    save_var('downloaded',1)
  execute('DROP TABLE IF EXISTS `final`')
  clean()
  save_var('downloaded',None)
Code example #12
def main():
    if get_var('skip') == None:
        save_var('skip', 0)
    routesTable = getroutes()
    for row in routesTable:
        if row['key'][0:2] != row['key'][2:4]:
            get_route_schedules(row['id'], row['key'])
Code example #13
def main():
    foo=get_var('runId')
    runId=1 if foo==None else foo+1
    save_var('runId',runId)
    try:
        nonsense()
    except:
        try:
            nonsense()
        except:
            exceeded(runId)
Code example #14
def main():
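    # 'runId' is a persisted counter of scraper runs; it is reported via exceeded() when both attempts fail.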
    foo = get_var('runId')
    runId = 1 if foo == None else foo + 1
    save_var('runId', runId)
    try:
        nonsense()
    except:
        try:
            nonsense()
        except:
            exceeded(runId)
Code example #15
def main():
    if get_var('province') == 'step2':
        separate_addresses()
        execute('DELETE FROM swvariables WHERE name = "province"')
        commit()
        print("""
    ================================
    This run is finished!
    ================================
    """)
    else:
        download()
Code example #16
def main():
  if get_var('province')=='step2':
    separate_addresses()
    execute('DELETE FROM swvariables WHERE name = "province"')
    commit()
    print("""
    ================================
    This run is finished!
    ================================
    """)
  else:
    download()
Code example #17
def download(abridge=False):
    d = []

    #Resume the saved provinces
    provinces = getprovinces()
    province_json = get_var('province')
    province = provinces[0] if province_json == None else loads(province_json)

    #Put the date in. This will get passed along, so this is the only time I add it.
    province['date_scraped'] = get_var('DATE', int(time()))

    #Get the cities
    cities = getcities(province['provinceId'])

    for city in cities:
        #Pass along the province
        city.update(province)

        branches = getbranches_with_info(city['cityId'])
        for branch in branches:
            #print branch
            branch.update(city)
            d.append(branch)

        if abridge:
            break

    i = [p['provinceId'] for p in provinces].index(province['provinceId']) + 1
    print provinces
    if i < len(provinces):
        save_var('province', dumps(provinces[i]))
        print('Finished with branches in %s' % province['provinceName'])
    else:
        save_var('province', None)
        print('Finished with all the downloading!')

    save([], d, 'initial')
Code example #18
def download(abridge=False):
  d=[]

  #Resume the saved provinces
  provinces=getprovinces()
  province_json=get_var('province')
  province=provinces[0] if province_json==None else loads(province_json)

  #Put the date in. This will get passed along, so this is the only time I add it.
  province['date_scraped']=get_var('DATE', int(time()))

  #Get the cities
  cities=getcities(province['provinceId'])

  for city in cities:
    #Pass along the province
    city.update(province)

    branches=getbranches_with_info(city['cityId'])
    for branch in branches:
      #print branch
      branch.update(city)
      d.append(branch)

    if abridge:
      break

  i=[p['provinceId'] for p in provinces].index(province['provinceId'])+1
  print provinces
  if i<len(provinces):
    save_var('province',dumps(provinces[i]))
    print('Finished with branches in %s' % province['provinceName'])
  else:
    save_var('province',None)
    print('Finished with all the downloading!')

  save([],d,'initial')
Code example #19
def cp1():
  execute('''
CREATE TABLE IF NOT EXISTS `businessPremises` (
  `date_scraped` REAL,
  `businessPremisesURL` TEXT,
  FOREIGN KEY (date_scraped, businessPremisesUrl)
  REFERENCES cp1(date_scraped, businessPremisesUrl)
)
''')

  if get_var('crashed') == 1:
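    # Resume: reopen the listing at the furthest page reached during the most recent run.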
    pagenum = select('max(pagenum) from cp1 where date_scraped = (select max(date_scraped) from cp1)')[0]['max(pagenum)']
    print "Resuming from page %d" % pagenum
    p = Page('CP1')
    p = Page('CP1', s=p.s, pagenum=pagenum)
  else:
    print "Starting a new run"
    p = Page('CP1')

  while p.lastpage()==False:
    print "Beginning page %d" % p.pagenum
    tables=p.table().subtables()
    d = []
    for table in tables:
        row = table.parse()
        row['businessPremisesURL'] = table.business_premises_url()

        try:
            business_premises_data, more_registrant_data = table.business_premises(p.s)
        except Exception, msg:
            print "Error on %s: msg" % table.business_premises_url()
            sleep(60)
            print "Trying again"
            business_premises_data, more_registrant_data = table.business_premises(p.s)

        row['date_scraped']=DATE
        row['pagenum']=p.pagenum
        row['url']=URL+"?page=%d"%p.pagenum

        row.update(more_registrant_data)

        save([], business_premises_data, 'businessPremises')
        save(['date_scraped', 'businessPremisesURL'],row,'cp1')

        sleep(1)
    save_var('crashed', 1)
    p=p.next25()
Code example #20
def get_route_schedules(routeId,route):
  #Check that it's not a route within one city
  assert route[0:2]!=route[2:4]

  xml,theurl=grab(route)
  save(['routeId','url'],{
    "routeId":routeId
  , "url":theurl
  },'urls')

  try:
    table=get_table(xml)
  except:
    save([],{"url":theurl},'errors')
  else:
    d_raw=parse_table(table)
    d=[]

    for row_raw in d_raw:
      row_clean={}
      for key in row_raw:
        if key==":Route/Trip":
          row_clean['routeNum']=row_raw[key]
        else:
          foo,bar,baz=key.split(':')
          if foo=="From":
            row_clean['fromCity']=bar
            row_clean['fromStop']=baz
            row_clean['fromTime']=row_raw[key]
          elif foo=="To":
            row_clean['toCity']=bar
            row_clean['toStop']=baz
            row_clean['toTime']=row_raw[key]
      row_clean['routeId']=routeId

      if row_clean['toStop']=='megabus.com stop' and row_clean['fromStop']=='megabus.com stop':
        table_name='megabus'
      else:
        table_name='schedules'

      save([],row_clean,table_name)
    save_var('skip',get_var('skip')+1)
Code example #21
def get_route_schedules(routeId, route):
    #Check that it's not a route within one city
    assert route[0:2] != route[2:4]

    xml, theurl = grab(route)
    save(['routeId', 'url'], {"routeId": routeId, "url": theurl}, 'urls')

    try:
        table = get_table(xml)
    except:
        save([], {"url": theurl}, 'errors')
    else:
        d_raw = parse_table(table)
        d = []

        for row_raw in d_raw:
            row_clean = {}
            for key in row_raw:
                if key == ":Route/Trip":
                    row_clean['routeNum'] = row_raw[key]
                else:
                    foo, bar, baz = key.split(':')
                    if foo == "From":
                        row_clean['fromCity'] = bar
                        row_clean['fromStop'] = baz
                        row_clean['fromTime'] = row_raw[key]
                    elif foo == "To":
                        row_clean['toCity'] = bar
                        row_clean['toStop'] = baz
                        row_clean['toTime'] = row_raw[key]
            row_clean['routeId'] = routeId

            if row_clean['toStop'] == 'megabus.com stop' and row_clean[
                    'fromStop'] == 'megabus.com stop':
                table_name = 'megabus'
            else:
                table_name = 'schedules'

            save([], row_clean, table_name)
        save_var('skip', get_var('skip') + 1)
Code example #22
def jobs(limit=2):
  print("Scraping individual job information")
  previous_href=get_var('previous_href',verbose=False)

  if previous_href==None:
    hrefs=[row['href'] for row in select('href FROM links ORDER BY href LIMIT %d' % limit,verbose=False)]

  else:
    hrefs=getnexthrefs(limit,previous_href)
    previous_url=atomic()

    if MORE_QUERY_STRING['base'] in previous_url:
      print "Resuming from %s" % previous_url
      url,startpage_str=re.split(MORE_QUERY_STRING['re'],previous_url)
      href_resume=url.replace('http://www.nyc.gov/lobbyistsearch/','')
      startpage=int(startpage_str)
      paginate_result(href_resume,startpage=startpage)
      randomsleep()

  for href in hrefs:
    paginate_result(href,startpage=1)
    randomsleep()
  save_var('previous_href',hrefs[-1],verbose=False)
Code example #23
def shallow_scrape():
    br = mechanize.Browser()

    c = sqlite.get_var('last_page', 0) + 1
    max_c = c + 6

    resultspage = br.open(
        "http://www.education.gov.uk/edubase/quickSearchResult.xhtml?page=%d" %
        c)

    while c < max_c:
        print ""
        print "Handling page %d..." % c
        print "  [" + br.geturl() + "]"

        ### extract data from page
        page = html.parse(resultspage)

        for u in page.getroot().findall(
                "body/div/div/div/div/table/tr/td/table/tbody/tr/td/a"):
            urn = re.search("urn=([0-9]{6})", u.get("href")).group(1)
            yield urn

        ### get new page
        try:
            resultspage = br.follow_link(text="Next")
            sqlite.save_var('last_page', c)

            c += 1
            if c % 2 == 0:
                time.sleep(10)

        except mechanize.LinkNotFoundError:
            c += 1
            sqlite.save_var('last_page', 0)
            break
Code example #24
from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit
from scraperwiki import swimport
from requests import session
from lxml.html import fromstring, tostring
import re
from time import time, sleep
keyify=swimport('keyify').keyify

URL="http://www.ncr.org.za/register_of_registrants/index.php"

#DEV=True
DEV=False

DATE = get_var('DATE', time())

RE={
  'leftpadding':re.compile(r'^ *')
, 'rightpadding':re.compile(r' *$')
}

def cp1():
  execute('''
CREATE TABLE IF NOT EXISTS `businessPremises` (
  `date_scraped` REAL,
  `businessPremisesURL` TEXT,
  FOREIGN KEY (date_scraped, businessPremisesUrl)
  REFERENCES cp1(date_scraped, businessPremisesUrl)
)
''')

  if get_var('crashed') == 1:
Code example #25
def select_branchIds(branches):
  branchIds=[unicode(branch['branchId']) for branch in branches]
  previous_branchId=get_var('previous_branchId')
  branchIds.sort()
  if previous_branchId==None:
    return branchIds
  i=branchIds.index(previous_branchId)
  return branchIds[i:]
Code example #26
if DEV:  # assumed guard: the original snippet begins partway through this local stub block
  __VARS={}
  def get_var(a):
    if __VARS.has_key(a):
      return __VARS[a]
  def save_var(a,b):
    __VARS[a]=b

  def options(*args,**kwargs):
    return [{"branchId":"174","branchName":"DUNNO"}]
else:
  options=swimport('options').options

URL="http://www.postbank.co.za/contact.aspx?ID=3"

def log(foo):
  print(foo)

if get_var('previous_branchId')==None:
  save_var('DATE',time())
  FIRST_RUN=True
else:
  FIRST_RUN=False

DATE=get_var('DATE')

def main():
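  # Only the first run (no saved branchId) stores the full branch list; later runs resume from the last saved branchId.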
  b=PostbankBrowser()
  branches=b.get_branch_list()
  if FIRST_RUN:
    save_branches(branches)

  for branchId in select_branchIds(branches):
    b.load_branch(branchId)
Code example #27
import json
import requests
import scraperwiki.sqlite as db
import time

begin = 1

counciltype = json.loads(requests.get('http://mapit.mysociety.org/areas/LBO').content)
for council, data1 in counciltype.items():
    if(db.get_var('id') == council and begin == 0):
        begin = 1
    if(begin == 1):
        print data1['name']
        db.save_var('id', council)
        children = json.loads(requests.get('http://mapit.mysociety.org/area/%s/children' % council).content)
        for id, data in children.items(): 
                #time.sleep(1)
                json.loads(requests.get('http://mapit.mysociety.org/area/%s' % id).content)
                if (data['type'] == 'LBW'):            
                    #time.sleep(0.1)
                    kml = requests.get('http://mapit.mysociety.org/area/%s.kml' % id).content
                    councildata = {'type': data['type'],
                                   'parent_name': data1['name'],
                                   'id': int(id),
                                   'name': data['name'],
                                   'kml': kml[85:-7]}
                    db.save(['id'], councildata, verbose=0)
Code example #28
from scraperwiki.sqlite import save,get_var,save_var
from lxml import html

URL = "http://www.e-ships.net/new/?View=ShipSearchResult"
URL += "&ship_name=&fdwt=&tdwt=&last_ex_name=&fgt=&tgt=&imo=&fnrt=&tnrt=&ship_type=-1&fteu=&tteu=&"
URL += "flag=-1&floa=&tloa=&ship_class=-1&fbeam=&tbeam=&call_sign=&fdraft=&tdraft=&owner_id="
URL += "&fbuilt=&tbuilt=&manager_id=&fengine_kw_total=&tengine_kw_total=&builder_id=&fengine_hp_total="
URL += "&tengine_hp_total=&sortby=ship_name&p=%s"

i=get_var('page')
if i==None:
  i=0
while i<=1174:
    doc = html.parse(URL % i).getroot()
    rows = doc.xpath('//tr')
    if len(rows) == 1:
        break

    d=[]
    for row in rows:
      link = row.find('td/a')
      if link is None or not 'ShipDetails' in link.get('href'):
          continue
      number, name, type, dwt, built, flag, _ = map(lambda c: c.text, row)
      
      d.append({
        'number': number,
        'name': name, 
        'type': type,
        'dwt': dwt,
        'built': built,
Code example #29
from urllib2 import urlopen
from scraperwiki import pdftoxml
from scraperwiki.sqlite import save_var, get_var
from lxml.etree import fromstring, tostring
from unidecode import unidecode

#pdfxml = pdftoxml(urlopen('http://www.hydepark.org/schools/HPKCC%20Youth%20Programs%20Database-%20Version%203.pdf').read())
#save_var('pdfxml', unidecode(pdfxml))

pdfxml = get_var('pdfxml')
x = fromstring(pdfxml)
for page in x.xpath('//page'):
    bs = page.xpath('descendant::b/text()')
    if len(bs) == 10 and bs[0] == 'HPKCC Youth Programs Database':
        del (bs[0])

    if len(bs) != 9:
        raise ValueError('Wrong number of bold text boxes')
    elif bs[0:7] != [
            'Program Name', 'Program Desc.', 'Program Website',
            'Program Address', 'Contact Name', 'Contact Number',
            'Contact Email'
    ]:
        raise ValueError('Wrong table keys')

    updated, pagenumber = bs[7:9]

    lefts = map(int, list(set(page.xpath('text/@left'))))
    lefts.sort()
    for left in lefts:
        print [t.xpath('string()') for t in page.xpath('text[@left = "%d"]' % left)]
Code example #30
def resume(levels=(4, 3, 2, 1)):
    """Resume an incomplete scrape."""
    for level in levels:
        js = get_var(str(level))
        if js != None:
            resume_siblings(js, level)
Code example #31
from scraperwiki import scrape
from scraperwiki.sqlite import save,get_var
from urllib2 import urlopen
from lxml.html import fromstring
from datetime import *

part1 = 'http://wwe1.osc.state.ny.us/transparency/contracts/contractresults.cfm?PageNum_rsContract='
part2 = '&sb=a&searchBy=&a=Z0000&au=0&ac=&v=%28Enter+Vendor+Name%29&vo=B&cn=&c=-1&m1=0&y1=0&m2=0&y2=0&am=0&b=Search&entitytype=Agency&order=PAYEE_NAME&sort=ASC'

start_page=get_var('start_page')
if start_page==None:
    start_page=1
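# 'start_page' comes from a saved variable, so an interrupted run restarts partway through the result pages instead of from page 1.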

urlstrings = [ part1 + str(i) +part2 for i in range(start_page,992)]
headers = [
  'Vendor','Agency','Contract_Number','Current_Contract_Amount',
  'Spending_to_Date','Contract_Start_Date','Contract_End_Date',
  'Contract_Description','Contract_Type', 'Contract_Approval_Date'
]

for urlstring in urlstrings:
    page_data=scrape(urlstring)
    page_data=fromstring(page_data).cssselect('#tableData tr')
    dict_rows=[]
    for row in page_data:
        dict_row=dict(zip(headers, [cell.text_content().strip()  for cell in row.cssselect('td') if cell.text_content().strip() != None]))
        dict_row['url']=urlstring
        if dict_row:
            try:
                dict_row['Current_Contract_Amount']=float(dict_row.get('Current_Contract_Amount','').replace('$', '').replace(',', '').replace('(', '-').replace(')', ''))
                dict_row['Spending_to_Date']=float(dict_row.get('Spending_to_Date','').replace('$', '').replace(',', '').replace('(', '-').replace(')', ''))
Code example #32
def gen_urls():
    for skip in xrange(first_skip, last_skip + SKIP_STEP, SKIP_STEP):
        url = '%s&skip=%s' % (start_url, skip)
        if not db.get_var(url, 0, verbose=0):
            yield url
Code example #33
            row.update({'date_scraped': time(), 'ScraperRun': scraper_run, 'url': self.url, 'Record': int(self.url.split('=')[-1])})
            data.append(row)

        save([], data, 'BusinessPremises')

execute('CREATE TABLE IF NOT EXISTS Registrant (ScraperRun INTEGER, Record INTEGER)')
execute('CREATE INDEX IF NOT EXISTS RegistrantRecord ON Registrant(record)')
execute('CREATE TABLE IF NOT EXISTS BusinessPremises (ScraperRun INTEGER, Record INTEGER, FOREIGN KEY(Record) REFERENCES Registrant(Record))')
execute('CREATE INDEX IF NOT EXISTS BusinessPremisesRecord ON BusinessPremises(ScraperRun, Record)')
commit()

if "stack" not in show_tables() or select('count(*) as "c" from stack')[0]['c'] == 0:
    save_var('scraper_run', int(time()))

scraper_run = get_var('scraper_run', None)
if scraper_run == None:
    raise NameError('scraper_run is not defined.') 

seed([SearchResults(None)])
#seed([BusinessPremises('http://www.ncr.org.za/register_of_registrants/viewpremises.php?record=11296')])

from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit, show_tables
from scraperwiki import swimport
#from requests import session
import requests
from lxml.html import fromstring, tostring
import re
from time import time, sleep
keyify=swimport('keyify').keyify
randomsleep=swimport('randomsleep').randomsleep

# --------------------------------------------------
Code example #34
    return branch

def parse_maphref(maphref):
    html=maphref.split("'")[1].replace('<br>','')
    x=fromstring(html)
    keys=["map_%s" % keyify(key) for key in x.xpath('strong/text()')]
    values=x.xpath('text()')
    return dict(zip(keys,values))

execute('CREATE TABLE IF NOT EXISTS provinces (provinceUrl TEXT )')
execute('CREATE TABLE IF NOT EXISTS cities (provinceUrl TEXT, cityUrl TEXT, FOREIGN KEY(provinceUrl) REFERENCES provinces(provinceUrl) )')
execute('CREATE TABLE IF NOT EXISTS branches (cityUrl TEXT, branchUrl TEXT, FOREIGN KEY(cityUrl) REFERENCES cities(cityUrl) )')
commit()

scraperrun = get_var('scraperrun', int(time()))
save_var('scraperrun', scraperrun)
seed([Menu(URLS['main'])])
execute('delete from swvariables where name = "scraperrun"')
commit()

from lxml.html import fromstring
#from lxml.etree import fromstring
from time import time
import requests
from scraperwiki.sqlite import save,save_var, get_var, select, commit, execute
from scraperwiki import swimport
options=swimport('options').options
keyify=swimport('keyify').keyify
randomsleep=swimport('randomsleep').randomsleep
from json import loads,dumps
strip_address = swimport('strip_address').strip_address
from urllib2 import urlopen
from scraperwiki import pdftoxml
from scraperwiki.sqlite import save_var, get_var
from lxml.etree import fromstring, tostring
from unidecode import unidecode

#pdfxml = pdftoxml(urlopen('http://www.hydepark.org/schools/HPKCC%20Youth%20Programs%20Database-%20Version%203.pdf').read())
#save_var('pdfxml', unidecode(pdfxml))

pdfxml = get_var('pdfxml')
x = fromstring(pdfxml)
for page in x.xpath('//page'):
    bs = page.xpath('descendant::b/text()')
    if len(bs) == 10 and bs[0] == 'HPKCC Youth Programs Database':
        del(bs[0])

    if len(bs) != 9:
        raise ValueError('Wrong number of bold text boxes')
    elif bs[0:7] != ['Program Name', 'Program Desc.', 'Program Website', 'Program Address', 'Contact Name', 'Contact Number', 'Contact Email']:
        raise ValueError('Wrong table keys')

    updated, pagenumber = bs[7:9]

    lefts = map(int, list(set(page.xpath('text/@left'))))
    lefts.sort()
    for left in lefts:
        print [t.xpath('string()') for t in page.xpath('text[@left = "%d"]' % left)]

    # tops = map(int, list(set(page.xpath('text/@top'))))
    # tops.sort()
    # print len(tops)
Code example #36
def resume(levels=(4,3,2,1)):
  """Resume an incomplete scrape."""
  for level in levels:
    js=get_var(str(level))
    if js!=None:
      resume_siblings(js,level)
Code example #37
        text = '\n'.join(text.split('\n')[2:4]).replace("document.getElementById('bizdir_directory').innerHTML = '", '')
        text = re.sub(r"';\s*document.getElementById('bizdir_search').disabled = false;", '', text).replace("&nbsp;&nbsp;8</div>';", '&nbsp;&nbsp;8</div>').replace("\\'", '')
        html = fromstring(text)
        bizdir_directory = []
        for tr in html.cssselect('#bizdir_directory tr'):
            try:
                assert tr.xpath('count(td)') == 1
                name = element(tr, 'td/b').text_content()
                description = element(tr, 'td/p/text()')
                bizdir_directory.append({'name': name, 'description': description, 'pageOffset': self.offset, 'scraperrun': scraperrun})
            except:
                print tostring(tr)
                raise
        save(['scraperrun', 'pageOffset', 'name'], bizdir_directory, 'organizations')

scraperrun = get_var('scraperrun', time())
save_var('scraperrun', scraperrun)
seed([Directory('http://www.chicagoistheworld.org/notalone/directory-of-youth-organizations/')])
execute('DROP TABLE stack')
execute('DROP TABLE swvariables')
commit()

from lxml.html import fromstring, tostring
from time import time, sleep
import requests
from scraperwiki.sqlite import save,save_var, get_var, select, commit, execute
import re

# --------------------------------------------------
# Begin Bucket-Wheel
# --------------------------------------------------
class Stack:
    "A fancier stack, at some point"
Code example #38
#!/usr/bin/env python
"""Download postsecret images"""

from urllib2 import urlopen
from lxml.html import fromstring
from scraperwiki.sqlite import save,select,NoSuchTableSqliteError,get_var,save_var
import base64

URL='http://www.postsecret.com'
WAYBACK_URL=get_var('wayback_url')


#End imports
#-----------

def wayback(url):
  """Download from the wayback machine."""
  xml=pull(url)
  try:
    parse(url,xml,suffix='_wayback')
    url=xml.xpath('//a[img[@src="http://staticweb.archive.org/images/toolbar/wm_tb_prv_on.png"]]')[0].attrib['href']
    print url
    wayback(url)
  except:
    save_var('wayback_url',url)

def parse(url,xml=None,suffix=''):
  if xml==None:
    xml=pull(url)
  sunday=xml.xpath('//h2[@class="date-header"]')[0].text
Code example #39
def getroutes():
  skip=get_var('skip')
  json=urlopen("http://coach.iriscouch.com/routes/_design/coach/_view/fullRoutes?skip=%d&limit=%d" % (skip,skip+1000)).read()
  table=loads(json)['rows']
  return table
Code example #40
execute(
    'CREATE TABLE IF NOT EXISTS Registrant (ScraperRun INTEGER, Record INTEGER)'
)
execute('CREATE INDEX IF NOT EXISTS RegistrantRecord ON Registrant(record)')
execute(
    'CREATE TABLE IF NOT EXISTS BusinessPremises (ScraperRun INTEGER, Record INTEGER, FOREIGN KEY(Record) REFERENCES Registrant(Record))'
)
execute(
    'CREATE INDEX IF NOT EXISTS BusinessPremisesRecord ON BusinessPremises(ScraperRun, Record)'
)
commit()

if "stack" not in show_tables() or select(
        'count(*) as "c" from stack')[0]['c'] == 0:
    save_var('scraper_run', int(time()))

scraper_run = get_var('scraper_run', None)
if scraper_run == None:
    raise NameError('scraper_run is not defined.')

seed([SearchResults(None)])
#seed([BusinessPremises('http://www.ncr.org.za/register_of_registrants/viewpremises.php?record=11296')])

from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit, show_tables
from scraperwiki import swimport
#from requests import session
import requests
from lxml.html import fromstring, tostring
import re
from time import time, sleep
keyify = swimport('keyify').keyify
randomsleep = swimport('randomsleep').randomsleep

Code example #41
import json
import requests
import scraperwiki.sqlite as db
import time

begin = 0

counciltype = json.loads(requests.get("http://mapit.mysociety.org/areas/DIS").content)
time.sleep(1)
for council, data1 in counciltype.items():
    print data1["name"]
    if db.get_var("id") == council and begin == 0:
        begin = 1
    if begin == 1:
        db.save_var("id", council)
        children = json.loads(requests.get("http://mapit.mysociety.org/area/%s/children" % council).content)
        time.sleep(1)
        for id, data in children.items():
            json.loads(requests.get("http://mapit.mysociety.org/area/%s" % id).content)
            time.sleep(1)
            if data["type"] == "DIW":
                kml = requests.get("http://mapit.mysociety.org/area/%s.kml" % id).content
                time.sleep(1)
                councildata = {
                    "type": data["type"],
                    "parent_name": data1["name"],
                    "id": int(id),
                    "name": data["name"],
                    "kml": kml[85:-7],
                }
                db.save(["id"], councildata, verbose=0)
Code example #42
def grab(from_city,from_state,to_city,to_state):
  theurl=url(from_city,from_state,to_city,to_state)
  opener = build_opener(HTTPCookieProcessor())

  try:
    o=opener.open(theurl)
  except BadStatusLine:
    return None

  xml=fromstring(o.read())
  if not route_exists(xml):
    return None

  try:
    table=xml.xpath('//table[tr[@class="tableHilightHeader"]]')[0]
  except:
    save([],{
      "from_city":from_city
    , "from_stat":from_state
    , "to_city":to_city
    , "to_state":to_state
    },'errors')
    return None

  #cities=table.xpath('tr[position()=1]/td')
  schedules=table.xpath('tr[position()>2]')
  columns=get_columns(table)

  #Get the id
  odId=get_var('origin_destination_id')
  sId=get_var('schedule_id')
  if None==odId:
    odId=1
  if None==sId:
    sId=1
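  # odId and sId persist via save_var (at the end of this function), so ids stay unique when the scraper resumes.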

  #Initialize for the loop
  d=[]
  on_fromstops=True

  for schedule in schedules:
    times=schedule.xpath('td/child::node()[position()=1]')
    #times.pop()
    #times.append(schedule.xpath('td/text()')[-1])
    print zip(times,columns)
    #assert False
    for value,column in zip(times,columns):
      if "days"==column:
        row={"key":"days"}
      elif "arrow"==column:
        on_fromstops=False
        continue
      elif "Route/Trip"==column:
        row={"key":"route_code"}

      elif on_fromstops:
        row={
          "key":"fromstop"
        , "stop":column
        }
      elif not on_fromstops:
        row={
          "key":"tostop"
        , "stop":column
        }
      #End if statement
      row.update({
        "value":value
      , "sId":sId
      , "odId":odId
      })
      d.append(row)
    #End for loop
    sId+=1
  #End for loop

  #Save origin-destination information
  save(['id'],{
    "id":odId
  , "from_city":from_city
  , "from_stat":from_state
  , "to_city":to_city
  , "to_state":to_state
  },'origin_destinations')

  #Save schedule information
  save([],d,'schedules')

  odId+=1
  save_var('origin_destination_id',odId)
  save_var('schedule_id',sId)