def scraperwiki():
    #zipcodes=get_zips(urlopen(URLS['zips']))
    attach('us_zip_codes')
    zipcodes = [str(row['zip']) for row in select('zip from zipcodes')]

    #Skip zipcodes that are already finished.
    try:
        finished_zipcodes = [
            row['zipcode'] for row in select('zipcode from finished_zipcodes')
        ]
    except:
        pass
    else:
        #print 'Already scraped these zipcodes:'
        for zipcode in finished_zipcodes:
            try:
                zipcodes.remove(zipcode)
            except ValueError:
                #The zipcodes database isn't complete
                pass

    for zipcode in zipcodes:
        print 'Scraping ' + zipcode
        lastpage = int(get_lastpage(search(zipcode, '1', save=False)))
        for page in [str(p) for p in range(1, lastpage + 1)]:
            theaters = get_theaters(zipcode, page, save=False)
            for theater in theaters:
                info = theater_info(theater)
                info = clean_info(info)
                save(['url'], info2dictRow(info, zipcode), 'locations')
            sleep(INTERVAL)
        save(['zipcode'], {'zipcode': zipcode}, 'finished_zipcodes')
Example 2
def main():
  """Check what has been scraped so far, then resume.
  It might be good to check for gaps in the scraping.
  Or maybe a recursive approach isn't the best for
  search pages like this."""

  #What's already been scraped recently?
  if not 'directory' in show_tables():
    last_searched=0
  else:
    #Only skip things from the current scraper completion attempt.
    if 'scrape_completions' in show_tables():
      raw_ids=select('scrape_ids from scrape_completions order by completion_id desc limit 1')[0]['scrape_ids']
      max_to_ignore=max(map(int,raw_ids.split(',')))
      min_to_scrape=max_to_ignore+1
    else:
      min_to_scrape=1
    incomplete_scrape=select('max("search_id") as m from directory where scrape_id>='+str(min_to_scrape))[0]['m']
    if incomplete_scrape!=None:
      last_searched=incomplete_scrape
    else:
      last_searched=0

  if 'scrape_times' in show_tables():
    last_id=select('max("scrape_id") as m from scrape_times')[0]['m']
  else:
    last_id=0

  #Time of scrape start
  scrape_id=last_id+1
  save(['scrape_id'],{"scrape_id":scrape_id,"scrape_time":time()},'scrape_times')
  grab(last_searched+1,{"scrape_id":scrape_id},oncompletion=oncompletion)
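#The docstring above suggests also checking for gaps in earlier scrapes.
#A minimal sketch of such a check, assuming the same select() helper and the
#`directory` table with an integer `search_id` column used above;
#find_search_gaps() itself is a hypothetical addition, not part of the scraper.
def find_search_gaps():
  done=set(row['search_id'] for row in select('"search_id" from directory'))
  if not done:
    return []
  #Every id between 1 and the largest one searched, minus those already done
  return sorted(set(range(1,max(done)+1))-done)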
Example 3
def analyze():
    d = select("""
  `link-href`, GROUP_CONCAT(`author`) AS `authors`, count(*) AS "count"
FROM `links`
JOIN `topics` ON `links`.`topic-href` = `topics`.`topic-href`
GROUP BY `link-href`
""")
    execute('DROP TABLE IF EXISTS `wrote-about-same-things`')
    save([], d, 'wrote-about-same-things')
    print '''
These look most exciting because three different people wrote about each.

3    Kiana Fitzgerald,Sara Peralta,Susan Raybuck    http://schedule.sxsw.com/2012/events/event_IAP100409
3    Shawn Dullye,Joe Vasquez,Sara Peralta          http://schedule.sxsw.com/2012/events/event_IAP10593
3    Shawn Dullye,Kiana Fitzgerald,Sara Peralta     http://schedule.sxsw.com/2012/events/event_IAP13848

Of course, that isn't adjusted for how many each person wrote.
'''

    d = select("""
  author, count(*) AS `how-many` FROM `links`
JOIN topics on links.`topic-href` = topics.`topic-href`
GROUP BY author
ORDER BY 2 DESC
""")
    save(['author'], d, 'how-many-did-you-link')
    print """
def scraperwiki():
  #zipcodes=get_zips(urlopen(URLS['zips']))
  attach('us_zip_codes')
  zipcodes=[str(row['zip']) for row in select('zip from zipcodes')]

  #Skip zipcodes that are already finished.
  try:
    finished_zipcodes=[row['zipcode'] for row in select('zipcode from finished_zipcodes')]
  except:
    pass
  else:
    #print 'Already scraped these zipcodes:'
    for zipcode in finished_zipcodes:
      try:
        zipcodes.remove(zipcode)
      except ValueError:
        #The zipcodes database isn't complete
        pass

  for zipcode in zipcodes:
    print 'Scraping '+zipcode
    lastpage=int(get_lastpage(search(zipcode,'1',save=False)))
    for page in [str(p) for p in range(1,lastpage+1)]:
      theaters=get_theaters(zipcode,page,save=False)
      for theater in theaters:
        info=theater_info(theater)
        info=clean_info(info)
        save(['url'],info2dictRow(info,zipcode),'locations')
      sleep(INTERVAL)
    save(['zipcode'],{'zipcode':zipcode},'finished_zipcodes')
Example 5
def find_similar_research():
    research = select('url, plaintext as "value" from maincol where url != ?;',
                      [reference_person])
    research.extend(
        select('url, plaintext as "value" from maincol where url = ?;',
               [reference_person]))
    documents = [row['value'].strip() for row in research]
    stoplist = set('for a of the and to in'.split())
    texts = [[
        word for word in document.lower().split() if word not in stoplist
    ] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    vec = corpus[-1]  #The person being compared to

    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus])
    sims = index[tfidf[vec]]

    print list(enumerate(sims))
    save(['url'], [{
        "url": row[0],
        "similarity": row[1][1]
    } for row in zip([row['url'] for row in research], list(enumerate(sims)))],
         'similarity')
def test_wrapper(test_func):
  attach('scraperwiki_events_eventbrite_guestlists')
  original_data=select('* from `ny` where `Twitter Handle`="thomaslevine";')

  table_name=test_func(original_data)
  sleep(5)

  attach('ajax_tractor')
  ajaxed_data=select('* from `%s` where `Twitter Handle`="thomaslevine";' % table_name)
  print original_data,ajaxed_data
  print original_data==ajaxed_data
def go():
  attach('new_mexico_state_audits')
  nodeIds=[row['nodeId'] for row in select('nodeId from nodeIds')]
  scraped_nodeIds=[row['nodeId'] for row in select('nodeId from opinions order by time_scraped')] #So you get different information from consecutive partial runs.
  for nodeId in scraped_nodeIds:
    nodeIds.remove(nodeId)
  if len(nodeIds)==0:
    nodeIds=scraped_nodeIds
  for nodeId in nodeIds:
    print 'Scraping node '+nodeId
    parse(nodeId)
Example 8
def go():
    attach('new_mexico_state_audits')
    nodeIds = [row['nodeId'] for row in select('nodeId from nodeIds')]
    scraped_nodeIds = [
        row['nodeId']
        for row in select('nodeId from opinions order by time_scraped')
    ]  #So you get different information from consecutive partial runs.
    for nodeId in scraped_nodeIds:
        nodeIds.remove(nodeId)
    if len(nodeIds) == 0:
        nodeIds = scraped_nodeIds
    for nodeId in nodeIds:
        print 'Scraping node ' + nodeId
        parse(nodeId)
def get_scraper_state():
  all_views=[row['value'] for row in select('value FROM views ORDER BY value', verbose=False)]
  if 'links' not in show_tables():
    years_to_do=[row['value'] for row in select('value FROM years ORDER BY value', verbose=False)]
    remaining_views_this_year=all_views
  else:
    finished=select('max(view) as "view",year from links where year=(select max(year) from links)', verbose=False)
    years_to_do=[row['value'] for row in select('value FROM years WHERE value>"%s" ORDER BY value' % finished[0]['year'], verbose=False)]
    remaining_views_this_year=[row['value'] for row in select('value from views where value>"%s"' % finished[0]['view'], verbose=False)]
    del(finished)
  return {
    "all-views":all_views
  , "years-to-do":years_to_do
  , "remaining-views-this-year":remaining_views_this_year
  }
def join():
  disclosures=select('Entity,upper(Entity) as "ENTITY" from disclosures where entity is not null')
  disclosures_cleaned=[{
    "raw":row['Entity']
  , "clean":remove_ny(row['ENTITY']).strip()
  } for row in disclosures]
  save([],disclosures_cleaned,'disclosures_cleaned')


  licenses=select('Vendor,upper(Vendor) as "VENDOR" from swdata where Vendor is not null')
  licenses_cleaned=[{
    "raw":row['Vendor']
  , "clean":remove_ny(row['VENDOR']).strip()
  } for row in licenses]
  save([],licenses_cleaned,'licenses_cleaned')
Example 11
def geocode():
    if "address" not in show_tables():
        initialize()

    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]
        #print address
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params )
        commit()
def find_similar_research():
    research = select('url, value from maincol where url != ?;', [reference_person])
    research.extend(select('url, value from descriptions where url = ?;', [reference_person]))
    documents = [row['value'].strip() for row in research]
    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    vec = corpus.pop() #The person being compared to
    
    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus])
    sims = index[tfidf[vec]]

    save(['url'], [{"url": row[0], "similarity": row[1][1]} for row in zip([row['url'] for row in research], list(enumerate(sims)))], 'similarity')
Example 13
    def update_item(self, item, item_type, unique_keys, table_name):
        now = datetime.utcnow()
        where = ' and '.join([("%s = '%s'" % (ukey, item[ukey]))
            for ukey in unique_keys[item_type]])
        sqlquery = "* from %s where %s" % (table_name, where)
        
        try:
            item_in_database = sqlite.select(sqlquery)
            saved_item = { key: value
                for key, value in item_in_database[0].iteritems()
                if value and key not in ['created', 'modified'] 
            } # because the item to be saved doesn't have these fields

            current_item = { key: value
                for key, value in item.iteritems()
                if value
            } # ignoring blank/Nones etc.

            item['modified'] = (item_in_database[0]['modified']
                if current_item == saved_item and item_in_database[0]['modified']
                else now)

            item['created'] = (item_in_database[0]['created']
                if item_in_database[0]['created']
                else now)

        except (sqlite.SqliteError, IndexError, KeyError):
            item['created'] = item['modified'] = now

        return item
Example 14
def swversion(table_name='swdata'):
    if table_name in show_tables():
        timestamp = select("max(date_extracted) as m from %s;" %
                           table_name)[0]['m']
        execute("ALTER TABLE `%s` RENAME TO `%s_%d`;" %
                (table_name, table_name, timestamp))
        commit()
Example 15
def extract_postcodes():
  sql = ' `rowid`, `address` from `branches`;'
  for row in select(sql):
    postcodes = findall(r'[0-9]{4}', row['address'])
    if len(postcodes) != 0:
      execute("UPDATE `branches` SET `postcode` = ? WHERE `rowid` = ? ", (postcodes[-1], row['rowid']) )
  commit()
def get_page(url,table_name="pages"):
  if not table_name in show_tables():
    raise PageNotSavedError(url)
  else:
    rows=select("`text` from %s where url=?" % table_name,[url])
    l=len(rows)
    if l==0:
      raise PageNotSavedError(url)
    elif l>1:
      raise DatastoreError(url,"Multiple rows match this url.")
    elif l==1:
      if not 'text' in rows[0].keys():
        raise DatastoreError(url,"The database does not have a `text` column.")
      else:
        return rows[0]['text']


#Tests

#import unittest
#class TestGetPage(unittest.TestCase):
#  def test_good_page(self):
#    url="https://scraperwiki.com/scrapers/dbgetpy/"
#    get_page(url)
#    row=select('* from `pages` where url=?',[url])[0]
#    assertEqual(set(row.keys()),set(["url","text"]))
#    assertIn("dbget=swimport('dbgetpy')",row['text'])

#if __name__ == '__main__':
#  print "Running tests"
#  unittest.main()
#else:
#  import os
#  print "Running from bash"
#  print os.execvp("python",["script.py"])
def main():
  #What has already been scraped
  if 'contributions' in show_tables():
    scraped=[row['querystring'] for row in select('querystring from contributions')]
  else:
    scraped=[]

  pagenumber=0
  while True:
    pagenumber=pagenumber+1
    xml=load(pagenumber)

    #Get the header row
    rows=xml.xpath('//table[@class="table_text"][tr[@class="tan_row"]]')[0].getchildren()[1:]
    keys=['name','contestant_party_district','date_received','class_and_partnum','association','monetary','non-monetary']

    #Get the data rows
    ds=[]
    for row in rows:
      d={}
      cells=row.getchildren()
      contributor=cells.pop(0).getchildren()[0]

      d['querystring']=contributor.attrib['href'].replace("javascript:PopUp('contributor.aspx?",'').replace("', '300', '300');",'')
      d[keys[0]]=contributor.text
      for i in range(1,len(cells)):
        d[keys[i]]=cells[i].text
      ds.append(d)

    #Don't run again if already run
    if ds[0]['querystring'] in scraped:
      break
    else:
      save(['querystring'],ds,'contributions')
    def pop(self):
        query = select('* from stack where rowid = (select max(rowid) from stack)')
        instantiate = "%s(%s)" % (query[0]['classname'], dumps(query[0]['url']))
        print instantiate
        obj = eval(instantiate)
        justpopped = obj
        return obj
Example 19
def oncompletion():
  scrape_ids=[str(row['scrape_id']) for row in select('scrape_id from scrape_times')]
  if 'scrape_completions' in show_tables():
    #Increment id
    completion_id=1+select('max("completion_id") as m from scrape_completions')[0]['m']
    #Remove old scrape_ids
    completion_rows=[row['scrape_ids'] for row in select('scrape_ids from scrape_completions')]
    old_scrapes=(','.join(completion_rows)).split(',')
    for old_scrape in old_scrapes:
      scrape_ids.remove(old_scrape)
  else:
    completion_id=1
  d={
    "completion_id":completion_id
  , "scrape_ids":','.join(scrape_ids)
  }
  save(['completion_id'],d,'scrape_completions')
    def pop(self):
        query = select(
            '* from stack where rowid = (select max(rowid) from stack)')
        instantiate = "%s(%s)" % (query[0]['classname'], dumps(
            query[0]['url']))
        print instantiate
        obj = eval(instantiate)
        justpopped = obj
        return obj
Example 21
def main():
    rowid = int(select('max(id) as id from organic_operations')[0]['id'])
    print 'Starting on ' + str(rowid + 1)
    while True:
        rowid = rowid + 1
        done = parse(rowid)['done']
        #Python doesn't support recursion well because it does not support tail recursion elimination
        if done:
            break
def main():
  rowid=int(select('max(id) as id from organic_operations')[0]['id'])
  print 'Starting on '+str(rowid+1)
  while True:
    rowid=rowid+1
    done=parse(rowid)['done']
    #Python doesn't support recursion well because it does not support tail recursion elimination
    if done:
      break
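#The comments above refer to CPython's recursion limit: without tail-call
#elimination, a recursive version of this loop keeps one stack frame per row
#and dies with "maximum recursion depth exceeded" once the remaining rows
#outnumber sys.getrecursionlimit() (about 1000 by default). A self-contained
#sketch of the two shapes; is_done() is a hypothetical stand-in for
#parse(rowid)['done'].
def is_done(rowid):
  return rowid>5000 #pretend the data runs out after 5000 rows

def walk_recursively(rowid):
  if is_done(rowid):
    return rowid
  return walk_recursively(rowid+1) #one stack frame per remaining row

def walk_iteratively(rowid):
  #Same traversal as main() above, constant stack depth
  while not is_done(rowid):
    rowid=rowid+1
  return rowid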
Example 23
def step2():
    urls = [
        row['url'] for row in select(
            'url from step2completion where browsed=0 limit 1456')
    ]  #That seems to be near the CPU-time limit
    for url in urls:
        save_sidebar(url)
        #Then update step2completion
        execute('UPDATE step2completion SET browsed=1 WHERE url=?', [url])
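#The LIMIT of 1456 above was tuned by hand against the platform's CPU-time
#limit. A hedged alternative sketch, not the original approach: work in small
#batches and stop once a wall-clock budget is spent, so the cap never needs
#retuning. It assumes the same select()/execute() helpers, the step2completion
#table and save_sidebar() from above; step2_budgeted() and BUDGET_SECONDS are
#hypothetical.
from time import time

BUDGET_SECONDS = 70 * 60  #stop well before the run would be killed

def step2_budgeted():
    started = time()
    while time() - started < BUDGET_SECONDS:
        rows = select('url from step2completion where browsed=0 limit 50')
        if not rows:
            break
        for row in rows:
            save_sidebar(row['url'])
            execute('UPDATE step2completion SET browsed=1 WHERE url=?', [row['url']])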
    def last(self):
        # Query
        query = select('* from main.stack where rowid = (select max(rowid) from main.stack)')

        # Load
        instantiate = "%s(%s)" % (query[0]['classname'], '"""' + query[0]['url'] + '"""')
        print instantiate
        obj = eval(instantiate)

        return obj
def nextid():
  defaultquery=[{"id":0}]
  if not OBS in show_tables():
    idquery=defaultquery
  else:
    idquery=select('max(id) as id from %s' % OBS)
    if len(idquery)==0:
      idquery=defaultquery
  id=idquery[0]['id']
  return id
def scrape(url,table_name="swdata", how_many = 10000):
  listurl=attendeelisturl(url)
  d=getattendeelist(listurl)
  d = getattendeelist(listurl + '&show_more=%d&sortid=0' % how_many)

  if table_name in show_tables():
    scraped_so_far=select('count(*) as "c" from `%s`'%table_name)[0]['c']
    saveattendeelist(d[0:-scraped_so_far],table_name)
  else:
    saveattendeelist(d,table_name)
Example 27
def nextid():
    defaultquery = [{"id": 0}]
    if not OBS in show_tables():
        idquery = defaultquery
    else:
        idquery = select('max(id) as id from %s' % OBS)
        if len(idquery) == 0:
            idquery = defaultquery
    id = idquery[0]['id']
    return id
Example 28
def geocode():
    if "scraped" not in show_tables():
        d = swimport('csv2sw').read.csv('https://views.scraperwiki.com/run/combine_mix_scraper_spreadsheets/')
        save([], d, 'scraped')

    if "address" not in show_tables():
        initialize()

    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]

        #print address
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params )
        commit()
Example 29
def scrape(url, table_name="swdata", how_many=10000):
    listurl = attendeelisturl(url)
    d = getattendeelist(listurl)
    d = getattendeelist(listurl + '&show_more=%d&sortid=0' % how_many)

    if table_name in show_tables():
        scraped_so_far = select('count(*) as "c" from `%s`' %
                                table_name)[0]['c']
        saveattendeelist(d[0:-scraped_so_far], table_name)
    else:
        saveattendeelist(d, table_name)
def geocode():
    if "scraped" not in show_tables():
        d = swimport('csv2sw').read.csv('https://views.scraperwiki.com/run/combine_mix_scraper_spreadsheets/')
        save([], d, 'scraped')

    if "address" not in show_tables():
        initialize()

    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]

        #print address
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params )
        commit()
def parsenames():
  d=select('`Lobbyist_Name` as "full_name" from `lobbyists`')
  for lobbyist in d:
    splitname=lobbyist['full_name'].split(', ')
    l=len(splitname)
    if l==2:
      lobbyist['last_name'],lobbyist['first_name']=splitname
    elif l==3:
      lobbyist['last_name'],lobbyist['suffix'],lobbyist['first_name']=splitname
    else:
      raise NameDelimiterError("This name has %d commas." % (l - 1))
  save([],d,'splitnames')
def main():
  for url in getUrls():
    slug=getScraperSlug(url)
    try:
      owners=getScraperOwners(slug)
    except:
      save(['url'],{"url":url},'errors')
    else:
      for owner in owners:
        save(['username'],{"username":owner},'users')
    save(['url'],{"url":url,"scraped":True},'urls')

  print 'Add bio html'
  if "`bio` TEXT" not in execute("SELECT sql FROM sqlite_master WHERE tbl_name = 'users' AND type = 'table'")['data'][0][0]:
    execute("ALTER TABLE `users` ADD COLUMN `bio` TEXT;")

  for username in getUsernames("bio"):
    bio=getUserProfile(username)
    save(['username'],{"username":username,"bio":bio},'users')

  print 'Add biotext'
  if "`biotext` TEXT" not in execute("SELECT sql FROM sqlite_master WHERE tbl_name = 'users' AND type = 'table'")['data'][0][0]:
    execute("ALTER TABLE `users` ADD COLUMN `biotext` TEXT;")

  for username in getUsernames("biotext"):
    bio=select('`bio` FROM `users` WHERE `username`=?',[username])[0]["bio"]
    biotext=getBioText(bio)
    save(['username'],{"username":username,"bio":bio,"biotext":biotext},'users')

  print 'Add code roles'
  if "`owns` INT" not in execute("SELECT sql FROM sqlite_master WHERE tbl_name = 'users' AND type = 'table'")['data'][0][0]:
    execute("ALTER TABLE `users` ADD COLUMN `owns` INT;")
    execute("ALTER TABLE `users` ADD COLUMN `edits` INT;")

  for username in getUsernames("owns"):
    d=getCodeRoles(username)
    execute("UPDATE `users` SET owns=?,edits=? WHERE username=?",[d["owns"],d["edits"],username])
    commit()

  print 'Add title variation'
  if "`distinct_title_tokens_count` INT" not in execute("SELECT sql FROM sqlite_master WHERE tbl_name = 'users' AND type = 'table'")['data'][0][0]:
    execute("ALTER TABLE `users` ADD COLUMN `distinct_title_tokens_count` INT;")
    execute("ALTER TABLE `users` ADD COLUMN `title_tokens` TEXT;")

  for username in getUsernames("distinct_title_tokens_count"):
    json=getUserJSON(username)
    d=titleVariation(json)
    execute("""
      UPDATE `users` SET distinct_title_tokens_count=?,title_tokens_count=?,title_tokens=?
      WHERE username=?;
      """,[d["distinct_count"],d["total_count"],d["text"],username]
    )
    commit()
def check_identical_screenshot(image_base64):
    """Check whether there's an identical screenshot already saved"""

    #If,else to handle new tables
    if 'images' in show_tables():
        identical_screenshot = select(
            'screenshot_id from images where image="' + image_base64 +
            '" limit 1')
    else:
        identical_screenshot = []

    if len(identical_screenshot) == 0:
        #No identical screenshot
        if 'images' in show_tables():
            screenshot_id = select(
                'max(screenshot_id) as id from images')[0]['id'] + 1
        else:
            screenshot_id = 1
        return (False, {"screenshot_id": screenshot_id, "image": image_base64})
    elif len(identical_screenshot) == 1:
        return (True, identical_screenshot[0])
def check_identical_screenshot(image_base64):
  """Check whether there's an identical screenshot already saved"""

  #If,else to handle new tables
  if 'images' in show_tables():
    identical_screenshot=select('screenshot_id from images where image="'+image_base64+'" limit 1')
  else:
    identical_screenshot=[]

  if len(identical_screenshot)==0:
    #No identical screenshot
    if 'images' in show_tables():
      screenshot_id=select('max(screenshot_id) as id from images')[0]['id']+1
    else:
      screenshot_id=1
    return (False,{
      "screenshot_id":screenshot_id
    , "image":image_base64
    })
  elif len(identical_screenshot)==1:
    return (True,identical_screenshot[0])
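#check_identical_screenshot() above matches on the full base64 string inside
#the WHERE clause. A hedged variant, not from the original scraper: store a
#SHA-1 digest next to each image and match on that, keeping the query short
#and parameterised. Assumes the same select()/show_tables() helpers and a
#hypothetical `image_sha1` column.
from hashlib import sha1

def screenshot_key(image_base64):
  return sha1(image_base64).hexdigest()

def find_screenshot_by_digest(image_base64):
  if 'images' not in show_tables():
    return None
  rows=select('screenshot_id from images where image_sha1=? limit 1',
              [screenshot_key(image_base64)])
  return rows[0] if rows else None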
def geocode():
    if "scraped" not in show_tables():
        d = swimport('csv2sw').read.csv('http://hacks.thomaslevine.com/all.csv')
        save([], d, 'scraped')
        execute('DELETE FROM `scraped` WHERE `Country` != "South Africa"')
        commit()

    if "address" not in show_tables():
        initialize()

    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]

        #print address
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params )
        commit()
def atomic():
  if "client"==pagetype(get_var('previous_href')):
    table_names=CLIENT_TABLES
  elif "lobbyist"==pagetype(get_var('previous_href')):
    table_names=LOBBYIST_TABLES
  else:
    raise ResumeError('The type of the previous href, "%s", could not be determined.' % get_var('previous_href'))

  if "clients_urls" in show_tables():
    sourceUrl=select('distinct sourceUrl as "s" from `clients_urls` where jobId=(select max(jobId) from `clients_urls`)')[0]['s']
    for table_name in table_names:
      execute('DELETE FROM `%s` where jobId in (select jobId from clients_urls where sourceUrl="%s")' % (table_name,sourceUrl))
    commit()
    return sourceUrl
def main(mode="initial"):
    end = 10000 # seems like most recent from http://www.historic.org.nz/en/TheRegister/RecentReg.aspx
    if mode == "initial":
        accessed = [p['place_id'] for p in store.select("place_id FROM _places_accessed")]
        #start = max(accessed)        
        for ref in xrange(end):
            if ref not in accessed:
                print ref
                do_place(ref)

    elif mode == "crawl":
        start = 0
        for ref in xrange(start, end):
            do_place(ref)
def go(number=1,pagetype="SCRAPERS"):
  foo=scrapepage(number,pagetype)
  is_end=('scraper_urls' in show_tables()) and (foo['lasturl'] in [row['url'] for row in select('url from scraper_urls')])
  #Save after checking whether it's the end because that's how I check.
  save(['url'],foo['scraper_urls'],'scraper_urls')

  if foo['lastpage']:
    #End when we reach the last page
    print "I scraped all the scrapers!"
  elif is_end:
    #End when we reach page where a scraper has already been scraped
    print "I scraped all of the new scrapers!"
  else:
    go(number+1,pagetype)
def get_page(url,table_name="pages"):
  if not table_name in show_tables():
    raise PageNotSavedError(url)
  else:
    rows=select("`text` from %s where url=?" % table_name,[url])
    l=len(rows)
    if l==0:
      raise PageNotSavedError(url)
    elif l>1:
      raise DatastoreError(url,"Multiple rows match this url.")
    elif l==1:
      if not 'text' in rows[0].keys():
        raise DatastoreError(url,"The database does not have a `text` column.")
      else:
        return rows[0]['text']
Example 40
def main():
  if not 'cities_done' in show_tables():
    cities_done=[]
  else:
    cities_done=select('* from cities_done')

  for fromcity in CITIES_NY:
    for tocity in CITIES_NY:
      if fromcity==tocity:
        print 'Skipping within-%s route' % fromcity
      elif {"from":fromcity,"to":tocity} in cities_done:
        print 'Already scraped %s to %s' % (fromcity,tocity)
      else:
        grab(fromcity,"NY",tocity,"NY")
        save([],{"from":fromcity,"to":tocity},'cities_done')
def moreparsing_map():
  "Map along the most recent results in the table (like a Couch map) and return a new one"
  d=select("* FROM `swdata` WHERE date_scraped=(SELECT max(date_scraped) from `swdata`);")
  for row in d:
    row['street-address'],row['postal-code']=splitAddress(row['Address_'])
    row['town']=extractTown(row['branchName'])
  if 'final' in show_tables():
    execute('DROP TABLE `final`;')

  d_final = []
  for row in d:
    if row['regionName'] not in ["Botswana", "Malawi", "Nambia"]:
      d_final.append(row)

  save([],d_final,'final')
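#The "Couch map" pattern above generalises: take the rows from the most recent
#scrape, apply a per-row transform, and write the result to a derived table.
#A minimal sketch assuming the same select()/save()/execute() helpers;
#map_latest() and its arguments are hypothetical, not part of the original.
def map_latest(source_table,transform,dest_table):
  d=select('* FROM `%s` WHERE date_scraped=(SELECT max(date_scraped) FROM `%s`);'%(source_table,source_table))
  mapped=[transform(row) for row in d]
  execute('DROP TABLE IF EXISTS `%s`;'%dest_table)
  save([],mapped,dest_table)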
def separate_addresses():
  execute('DROP TABLE IF EXISTS final')
  commit()
  d=select('* from `initial`')
  for row in d:
    splitaddress=row['address'].split('\n')
    l=len(splitaddress)
    if l==3:
      row['street-address'],row['subtown'],row['town2']=splitaddress
    elif l==2:
      row['street-address'],row['subtown']=splitaddress
    else:
      raise AddressError
    row['street-address'] = row['street-address'].strip()
    row['address'] = strip_address(row['address'])
  save([],d,'final')
Example 43
def separate_addresses():
    execute('DROP TABLE IF EXISTS final')
    commit()
    d = select('* from `initial`')
    for row in d:
        splitaddress = row['address'].split('\n')
        l = len(splitaddress)
        if l == 3:
            row['street-address'], row['subtown'], row['town2'] = splitaddress
        elif l == 2:
            row['street-address'], row['subtown'] = splitaddress
        else:
            raise AddressError
        row['street-address'] = row['street-address'].strip()
        row['address'] = strip_address(row['address'])
    save([], d, 'final')
Example 44
def cp1():
  execute('''
CREATE TABLE IF NOT EXISTS `businessPremises` (
  `date_scraped` REAL,
  `businessPremisesURL` TEXT,
  FOREIGN KEY (date_scraped, businessPremisesUrl)
  REFERENCES cp1(date_scraped, businessPremisesUrl)
)
''')

  if get_var('crashed') == 1:
    pagenum = select('max(pagenum) from cp1 where date_scraped = (select max(date_scraped) from cp1)')[0]['max(pagenum)']
    print "Resuming from page %d" % pagenum
    p = Page('CP1')
    p = Page('CP1', s=p.s, pagenum=pagenum)
  else:
    print "Starting a new run"
    p = Page('CP1')

  while p.lastpage()==False:
    print "Beginning page %d" % p.pagenum
    tables=p.table().subtables()
    d = []
    for table in tables:
        row = table.parse()
        row['businessPremisesURL'] = table.business_premises_url()

        try:
            business_premises_data, more_registrant_data = table.business_premises(p.s)
        except Exception, msg:
            print "Error on %s: msg" % table.business_premises_url()
            sleep(60)
            print "Trying again"
            business_premises_data, more_registrant_data = table.business_premises(p.s)

        row['date_scraped']=DATE
        row['pagenum']=p.pagenum
        row['url']=URL+"?page=%d"%p.pagenum

        row.update(more_registrant_data)

        save([], business_premises_data, 'businessPremises')
        save(['date_scraped', 'businessPremisesURL'],row,'cp1')

        sleep(1)
    save_var('crashed', 1)
    p=p.next25()
def parse(url, xml=None, suffix=''):
    if xml == None:
        xml = pull(url)
    print "Loading the page"
    scrapers = xml.xpath(PATH)
    for scraper in scrapers:
        if 'observations' in show_tables():
            observation_id = select(
                'max(observation_id) as id from observations')[0]['id'] + 1
        else:
            observation_id = 1
        identifiers = {"observation_id": observation_id}
        info = copy(identifiers)
        screenshot_identity = copy(identifiers)

        identifiers['time_scraped'] = time()
        identifiers['url'] = scraper.xpath('a')[0].attrib['href']

        print "Extracting metadata"
        info['owner'], info['title'] = scraper.xpath('a/h4')[0].text.split(
            '/', 1)
        info['language'], info['type'] = re.split(
            r'[^a-zA-Z]+',
            scraper.xpath('a/span[@class="about"]')[0].text)
        info['created'] = scraper.xpath('a/span[@class="when"]')[0].text

        screenshot_identity['url'] = scraper.xpath('a/img')[0].attrib['src']
        print "Checking whether I've already saved the screenshot"
        exists, image = check_identical_screenshot(
            getimage(screenshot_identity['url']))
        if exists:
            #If I have, don't do anything with the image
            print "Screenshot already saved"
        else:
            #If I haven't, save a new image
            print "Saving the new screenshot"
            image['observation_scraped_on'] = observation_id
            save(['observation_scraped_on', 'screenshot_id'], image, 'images')

        #Either way, link the observation to the saved image
        screenshot_identity['screenshot_id'] = image['screenshot_id']
        save(['observation_id'], screenshot_identity, 'screenshot_identidies')

        #Save these at the end to avoid partial rows
        print "Saving"
        save(['observation_id'], info, 'homepage_metadata')
        save(['observation_id'], identifiers, 'observations')