def main(): """Check what has been scraped so far, then resume. It might be good to check for gaps in the scraping. Or maybe a recursive approach isn't the best for search pages like this.""" #What's already been scraped recently? if not 'directory' in show_tables(): last_searched=0 else: #Only skip things from the current scraper completion attempt. if 'scrape_completions' in show_tables(): raw_ids=select('scrape_ids from scrape_completions order by completion_id desc limit 1')[0]['scrape_ids'] max_to_ignore=max(map(int,raw_ids.split(','))) min_to_scrape=max_to_ignore+1 else: min_to_scrape=1 incomplete_scrape=select('max("search_id") as m from directory where scrape_id>='+str(min_to_scrape))[0]['m'] if incomplete_scrape!=None: last_searched=incomplete_scrape else: last_searched=0 if 'scrape_times' in show_tables(): last_id=select('max("scrape_id") as m from scrape_times')[0]['m'] else: last_id=0 #Time of scrape start scrape_id=last_id+1 save(['scrape_id'],{"scrape_id":scrape_id,"scrape_time":time()},'scrape_times') grab(last_searched+1,{"scrape_id":scrape_id},oncompletion=oncompletion)
def main():
    if 'splitnames' in show_tables():
        print "Already finished"
    elif 'lobbyists' in show_tables():
        parsenames()
    else:
        download()
        parsenames()
def main():
    #What has already been scraped
    if 'contributions' in show_tables():
        scraped = [row['querystring'] for row in select('querystring from contributions')]
    else:
        scraped = []
    pagenumber = 0
    while True:
        pagenumber = pagenumber + 1
        xml = load(pagenumber)
        #Get the header row
        rows = xml.xpath('//table[@class="table_text"][tr[@class="tan_row"]]')[0].getchildren()[1:]
        keys = ['name', 'contestant_party_district', 'date_received', 'class_and_partnum', 'association', 'monetary', 'non-monetary']
        #Get the data rows
        ds = []
        for row in rows:
            #A fresh dict per row; sharing one dict would save N copies of the last row.
            d = {}
            cells = row.getchildren()
            contributor = cells.pop(0).getchildren()[0]
            d['querystring'] = contributor.attrib['href'].replace("javascript:PopUp('contributor.aspx?", '').replace("', '300', '300');", '')
            d[keys[0]] = contributor.text
            #After the pop, cells[i] holds the (i+2)th column, so it pairs with keys[i+1].
            for i in range(len(cells)):
                d[keys[i + 1]] = cells[i].text
            ds.append(d)
        #Don't run again if already run
        if ds[0]['querystring'] in scraped:
            break
        else:
            save(['querystring'], ds, 'contributions')
def main():
    #finalpage=get_var('finalpage')
    prevpage = get_var('prevpage')
    #if None==finalpage:
    if True:
        finalpage = int(get_lastpage(getpage(1)))
        save_var('finalpage', finalpage)
    if prevpage is None:
        prevpage = 1
    if prevpage < finalpage:
        step1(prevpage, finalpage)
    elif prevpage == finalpage:
        if "step2completion" not in show_tables():
            execute('create table `step2completion` (`url` text, `browsed` boolean)')
            execute("""
                INSERT INTO `step2completion` (url, browsed)
                SELECT url, 0 as "browsed" FROM locations
            """)
            commit()
        step2()
def get_page(url, table_name="pages"):
    if table_name not in show_tables():
        raise PageNotSavedError(url)
    else:
        rows = select("`text` from %s where url=?" % table_name, [url])
        l = len(rows)
        if l == 0:
            raise PageNotSavedError(url)
        elif l > 1:
            raise DatastoreError(url, "Multiple rows match this url.")
        elif l == 1:
            if 'text' not in rows[0].keys():
                raise DatastoreError(url, "The database does not have a `text` column.")
            else:
                return rows[0]['text']

#Tests
#import unittest
#class TestGetPage(unittest.TestCase):
#    def test_good_page(self):
#        url="https://scraperwiki.com/scrapers/dbgetpy/"
#        get_page(url)
#        row=select('* from `pages` where url=?',[url])[0]
#        self.assertEqual(set(row.keys()),set(["url","text"]))
#        self.assertIn("dbget=swimport('dbgetpy')",row['text'])

#if __name__ == '__main__':
#    print "Running tests"
#    unittest.main()
#else:
#    import os
#    print "Running from bash"
#    print os.execvp("python",["script.py"])
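#A minimal usage sketch (hypothetical url; PageNotSavedError is raised above
#when the page isn't cached yet):
#try:
#    text = get_page('http://example.com/page', table_name='pages')
#except PageNotSavedError:
#    text = None  #e.g. fetch the page here and save it for next time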
def swversion(table_name='swdata'):
    if table_name in show_tables():
        timestamp = select("max(date_extracted) as m from %s;" % table_name)[0]['m']
        execute("ALTER TABLE `%s` RENAME TO `%s_%d`;" % (table_name, table_name, timestamp))
        commit()
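#Usage sketch, assuming the table has a numeric `date_extracted` column as
#the query above requires:
#swversion()             #renames `swdata` to e.g. `swdata_1325376000`
#swversion('lobbyists')  #archives another table the same way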
def nextid():
    #Returns the highest id saved so far (0 if none); callers presumably add 1.
    defaultquery = [{"id": 0}]
    if OBS not in show_tables():
        idquery = defaultquery
    else:
        idquery = select('max(id) as id from %s' % OBS)
    #max(id) on an empty table yields a NULL row, so guard that case too.
    if len(idquery) == 0 or idquery[0]['id'] is None:
        idquery = defaultquery
    id = idquery[0]['id']
    return id
def scrape(url,table_name="swdata", how_many = 10000): listurl=attendeelisturl(url) d=getattendeelist(listurl) d = getattendeelist(listurl + '&show_more=%d&sortid=0' % how_many) if table_name in show_tables(): scraped_so_far=select('count(*) as "c" from `%s`'%table_name)[0]['c'] saveattendeelist(d[0:-scraped_so_far],table_name) else: saveattendeelist(d,table_name)
def geocode(): if "scraped" not in show_tables(): d = swimport('csv2sw').read.csv('https://views.scraperwiki.com/run/combine_mix_scraper_spreadsheets/') save([], d, 'scraped') if "address" not in show_tables(): initialize() while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0: address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0] #print address if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0: d = all_services(address['address-input']) for row in d: row['address-input'] = address['address-input'] save([], d, 'geocode') params = (address['address-column'], address['address-input']) execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params ) commit()
def check_identical_screenshot(image_base64): """Check whether there's an identical screenshot already saved""" #If,else to handle new tables if 'images' in show_tables(): identical_screenshot=select('screenshot_id from images where image="'+image_base64+'" limit 1') else: identical_screenshot=[] if len(identical_screenshot)==0: #No identical screenshot if 'images' in show_tables(): screenshot_id=select('max(screenshot_id) as id from images')[0]['id']+1 else: screenshot_id=1 return (False,{ "screenshot_id":screenshot_id , "image":image_base64 }) elif len(identical_screenshot)==1: return (True,identical_screenshot[0])
def check_identical_screenshot(image_base64): """Check whether there's an identical screenshot already saved""" #If,else to handle new tables if 'images' in show_tables(): identical_screenshot = select( 'screenshot_id from images where image="' + image_base64 + '" limit 1') else: identical_screenshot = [] if len(identical_screenshot) == 0: #No identical screenshot if 'images' in show_tables(): screenshot_id = select( 'max(screenshot_id) as id from images')[0]['id'] + 1 else: screenshot_id = 1 return (False, {"screenshot_id": screenshot_id, "image": image_base64}) elif len(identical_screenshot) == 1: return (True, identical_screenshot[0])
def geocode(): if "scraped" not in show_tables(): d = swimport('csv2sw').read.csv('http://hacks.thomaslevine.com/all.csv') save([], d, 'scraped') execute('DELETE FROM `scraped` WHERE `Country` != "South Africa"') commit() if "address" not in show_tables(): initialize() while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0: address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0] #print address if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0: d = all_services(address['address-input']) for row in d: row['address-input'] = address['address-input'] save([], d, 'geocode') params = (address['address-column'], address['address-input']) execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params ) commit()
def go(number=1, pagetype="SCRAPERS"):
    foo = scrapepage(number, pagetype)
    #select() returns rows (dicts), so compare against the url values themselves.
    is_end = ('scraper_urls' in show_tables()) and (foo['lasturl'] in [row['url'] for row in select('url from scraper_urls')])
    #Save after checking whether it's the end because that's how I check.
    save(['url'], foo['scraper_urls'], 'scraper_urls')
    if foo['lastpage']:
        #End when we reach the last page
        print "I scraped all the scrapers!"
    elif is_end:
        #End when we reach page where a scraper has already been scraped
        print "I scraped all of the new scrapers!"
    else:
        go(number + 1, pagetype)
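#go() recurses once per results page, so a very long run could hit Python's
#default recursion limit. A minimal iterative sketch of the same loop
#(go_iterative is a hypothetical name; it assumes scrapepage() and the
#tables behave exactly as above):
def go_iterative(number=1, pagetype="SCRAPERS"):
    while True:
        foo = scrapepage(number, pagetype)
        seen = ('scraper_urls' in show_tables()) and (foo['lasturl'] in [row['url'] for row in select('url from scraper_urls')])
        save(['url'], foo['scraper_urls'], 'scraper_urls')
        if foo['lastpage']:
            print "I scraped all the scrapers!"
            break
        elif seen:
            print "I scraped all of the new scrapers!"
            break
        number = number + 1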
def atomic():
    if "client" == pagetype(get_var('previous_href')):
        table_names = CLIENT_TABLES
    elif "lobbyist" == pagetype(get_var('previous_href')):
        table_names = LOBBYIST_TABLES
    else:
        raise ResumeError('The type of the previous href, "%s", could not be determined.' % get_var('previous_href'))
    if "clients_urls" in show_tables():
        sourceUrl = select('distinct sourceUrl as "s" from `clients_urls` where jobId=(select max(jobId) from `clients_urls`)')[0]['s']
        for table_name in table_names:
            #Bind sourceUrl as a parameter rather than quoting it into the SQL.
            execute('DELETE FROM `%s` where jobId in (select jobId from clients_urls where sourceUrl=?)' % table_name, [sourceUrl])
        commit()
        return sourceUrl
def main():
    if 'cities_done' not in show_tables():
        cities_done = []
    else:
        cities_done = select('* from cities_done')
    for fromcity in CITIES_NY:
        for tocity in CITIES_NY:
            if fromcity == tocity:
                print 'Skipping within-%s route' % fromcity
            elif {"from": fromcity, "to": tocity} in cities_done:
                print 'Already scraped %s to %s' % (fromcity, tocity)
            else:
                grab(fromcity, "NY", tocity, "NY")
                save([], {"from": fromcity, "to": tocity}, 'cities_done')
def get_scraper_state():
    all_views = [row['value'] for row in select('value FROM views ORDER BY value', verbose=False)]
    if 'links' not in show_tables():
        years_to_do = [row['value'] for row in select('value FROM years ORDER BY value', verbose=False)]
        remaining_views_this_year = all_views
    else:
        finished = select('max(view) as "view",year from links where year=(select max(year) from links)', verbose=False)
        years_to_do = [row['value'] for row in select('value FROM years WHERE value>"%s" ORDER BY value' % finished[0]['year'], verbose=False)]
        remaining_views_this_year = [row['value'] for row in select('value from views where value>"%s"' % finished[0]['view'], verbose=False)]
        del finished
    return {
        "all-views": all_views,
        "years-to-do": years_to_do,
        "remaining-views-this-year": remaining_views_this_year,
    }
def moreparsing_map():
    "Map along the most recent results in the table (like a Couch map) and return a new one"
    d = select("* FROM `swdata` WHERE date_scraped=(SELECT max(date_scraped) from `swdata`);")
    for row in d:
        row['street-address'], row['postal-code'] = splitAddress(row['Address_'])
        row['town'] = extractTown(row['branchName'])
    if 'final' in show_tables():
        execute('DROP TABLE `final`;')
    d_final = []
    for row in d:
        #"Nambia" is kept as-is so the filter matches whatever value appears in the data.
        if row['regionName'] not in ["Botswana", "Malawi", "Nambia"]:
            d_final.append(row)
    save([], d_final, 'final')
def geocode(): if "address" not in show_tables(): initialize() while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0: address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0] #print address if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0: d = all_services(address['address-input']) for row in d: row['address-input'] = address['address-input'] save([], d, 'geocode') params = (address['address-column'], address['address-input']) execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params ) commit()
def parse(url, xml=None, suffix=''):
    if xml is None:
        xml = pull(url)
        print "Loading the page"
    scrapers = xml.xpath(PATH)
    for scraper in scrapers:
        if 'observations' in show_tables():
            observation_id = select('max(observation_id) as id from observations')[0]['id'] + 1
        else:
            observation_id = 1
        identifiers = {"observation_id": observation_id}
        info = copy(identifiers)
        screenshot_identity = copy(identifiers)
        identifiers['time_scraped'] = time()
        identifiers['url'] = scraper.xpath('a')[0].attrib['href']

        print "Extracting metadata"
        info['owner'], info['title'] = scraper.xpath('a/h4')[0].text.split('/', 1)
        info['language'], info['type'] = re.split(r'[^a-zA-Z]+', scraper.xpath('a/span[@class="about"]')[0].text)
        info['created'] = scraper.xpath('a/span[@class="when"]')[0].text
        screenshot_identity['url'] = scraper.xpath('a/img')[0].attrib['src']

        print "Checking whether I've already saved the screenshot"
        exists, image = check_identical_screenshot(getimage(screenshot_identity['url']))
        if exists:
            #If I have, don't do anything with the image
            print "Screenshot already saved"
        else:
            #If I haven't, save a new image
            print "Saving the new screenshot"
            image['observation_scraped_on'] = observation_id
            save(['observation_scraped_on', 'screenshot_id'], image, 'images')
        #Either way, link the observation to the saved image
        screenshot_identity['screenshot_id'] = image['screenshot_id']
        save(['observation_id'], screenshot_identity, 'screenshot_identidies')

        #Save these at the end to avoid partial rows
        print "Saving"
        save(['observation_id'], info, 'homepage_metadata')
        save(['observation_id'], identifiers, 'observations')
def oncompletion():
    scrape_ids = [str(row['scrape_id']) for row in select('scrape_id from scrape_times')]
    if 'scrape_completions' in show_tables():
        #Increment id
        completion_id = 1 + select('max("completion_id") as m from scrape_completions')[0]['m']
        #Remove old scrape_ids
        completion_rows = [row['scrape_ids'] for row in select('scrape_ids from scrape_completions')]
        old_scrapes = (','.join(completion_rows)).split(',')
        for old_scrape in old_scrapes:
            scrape_ids.remove(old_scrape)
    else:
        completion_id = 1
    d = {
        "completion_id": completion_id,
        "scrape_ids": ','.join(scrape_ids),
    }
    save(['completion_id'], d, 'scrape_completions')
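#A worked example of the bookkeeping above: if scrape_times holds ids 1-5 and
#the only prior completion row recorded "1,2,3", the loop strips 1, 2 and 3,
#so the new row saved is {"completion_id": 2, "scrape_ids": "4,5"}.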
def _parse_and_save(self, SpecificDataRow, maintable):
    "Clean up stuff"
    #Skip the raw parse
    #job_raw=self.rawparse()
    #for row in job_raw:
    #    row['url']=self.url
    #save([],job_raw,maintable+'raw',verbose=False)
    for tr in self.getTableRows():
        #Get the next jobId
        if maintable in show_tables():
            jobId = select('max(jobId) as "jobId" from `%s`' % maintable, verbose=False)[0]['jobId'] + 1
        else:
            jobId = 1
        r = SpecificDataRow(tr, jobId, self.url)
        r.parse_and_save()
def main(): if "urls" not in show_tables(): copyUrlsDb() for url in getUrls(): slug = getScraperSlug(url) code, user = getCode(slug) if code != None: c = code.lower() save(['url'], { "code":code, "user": user, "url": url, "has_join": " join " in c, "has_attach": "attach" in c, "has_twitter": "twitter" in c, }) execute('UPDATE `urls` SET `scraped`=1 WHERE `url` = ?', url) commit() d = select('`user`, count(*) AS "attach-and-join-count" from `swdata` WHERE (`has_join` = 1 and `has_attach` = 1) GROUP BY `user`') save(['user'], d, 'results')
def resume_siblings(js, level):
    if level == 1:
        print "Finished resuming"
    elif OBS not in show_tables():
        pass
    else:
        parent = select('parentjs from %s order by date_scraped desc limit 1' % OBS)[0]['parentjs']
        foo, bar, baz = eval(parent.replace('getlaw', ''))
        xml = fromstring(getlaw(foo, bar, baz))
        links = get_law_links(xml, parent)
        linkslist = [link['observation']['js'] for link in links]
        if js not in linkslist:
            #It looks like the last sibling scraped was the last child of its parent;
            #None of its siblings need to be scraped
            pass
        else:
            first = linkslist.index(js) + 1
            last = len(linkslist)
            print level, first, last
            if first < last:
                for link in linkslist[first:last]:
                    search_directory_tree(link, level)
def main():
    if 'productlines' not in show_tables():
        save(['href'], getproductlinelinks(MENU), 'productlines')
    hrefs = [row['href'] for row in select('href from productlines')]
    for href in hrefs:
        p = ProductLine(href)
        t = p.current_models_table()
        #Overview
        save(['href'], p.overview(), 'overview')
        #Specifications
        save([], t.specifications(units="english"), 'specifications')
        save([], t.specifications(units="metric"), 'specifications')
        #Links to models
        model_links = t.model_links()
        for model_link in model_links:
            model_link['product-line-href'] = p.href
        save(['href'], model_links, 'models')
        #Links to non-current models
        save([], p.noncurrent_models_link(), 'current_noncurrent')
from scraperwiki.sqlite import save_var, execute, commit, show_tables
import os

if "swvariables" in show_tables():
    execute("DROP TABLE swvariables;")

before = set(os.listdir('.'))
save_var('foo', 'bar')
#os.system('rm *.pyc')
after = set(os.listdir('.'))
#print before-after
#print after

s = []
for f in after:
    #f[-3:-1] only grabbed two characters, so the old 'pyc' test never matched;
    #compare against the full extension instead.
    if f[0:4] != 'data' and not f.endswith('.pyc'):
        s.append(f)
print s

baz = []
baz.append('script.rb')
baz.append('.cache')
for f in baz:
    print open(f).read()

#Clean up the variables table again at the end.
if "swvariables" in show_tables():
    execute("DROP TABLE swvariables;")
#(fragment: the opening of the enclosing method is not included in this snippet)
        row.update({'premises_name': premises_name, 'town': town})
    else:
        row['enter_manually'] = 1
    row.update({
        'date_scraped': time(),
        'ScraperRun': scraper_run,
        'url': self.url,
        'Record': int(self.url.split('=')[-1]),
    })
    data.append(row)
save([], data, 'BusinessPremises')

execute('CREATE TABLE IF NOT EXISTS Registrant (ScraperRun INTEGER, Record INTEGER)')
execute('CREATE INDEX IF NOT EXISTS RegistrantRecord ON Registrant(record)')
execute('CREATE TABLE IF NOT EXISTS BusinessPremises (ScraperRun INTEGER, Record INTEGER, FOREIGN KEY(Record) REFERENCES Registrant(Record))')
execute('CREATE INDEX IF NOT EXISTS BusinessPremisesRecord ON BusinessPremises(ScraperRun, Record)')
commit()

if "stack" not in show_tables() or select('count(*) as "c" from stack')[0]['c'] == 0:
    save_var('scraper_run', int(time()))
scraper_run = get_var('scraper_run', None)
if scraper_run is None:
    raise NameError('scraper_run is not defined.')

seed([SearchResults(None)])
#seed([BusinessPremises('http://www.ncr.org.za/register_of_registrants/viewpremises.php?record=11296')])

#Imports for the scraper above:
from scraperwiki.sqlite import save, select, execute, save_var, get_var, commit, show_tables
from scraperwiki import swimport
#from requests import session
import requests
from lxml.html import fromstring, tostring
import re
from time import time, sleep

keyify = swimport('keyify').keyify
def is_new(this):
    """Check whether I've already saved it"""
    if 'tweets' not in show_tables():
        return True
    else:
        #Bind the id as a parameter so quoting in tweet ids can't break the query.
        return 0 == select('count(*) as c from tweets where id=?', [this._tweet['id']])[0]['c']
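#Usage sketch, assuming a wrapper object whose _tweet dict carries the raw
#tweet as above (the save() call here is hypothetical):
#if is_new(this):
#    save(['id'], this._tweet, 'tweets')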