def scraperwiki():
  #zipcodes=get_zips(urlopen(URLS['zips']))
  attach('us_zip_codes')
  zipcodes=[str(row['zip']) for row in select('zip from zipcodes')]

  #Skip zipcodes that are already finished.
  try:
    finished_zipcodes=[row['zipcode'] for row in select('zipcode from finished_zipcodes')]
  except Exception:
    #The finished_zipcodes table doesn't exist yet on the first run.
    pass
  else:
    #print 'Already scraped these zipcodes:'
    for zipcode in finished_zipcodes:
      try:
        zipcodes.remove(zipcode)
      except ValueError:
        #The zipcodes database isn't complete
        pass

  for zipcode in zipcodes:
    print 'Scraping '+zipcode
    lastpage=int(get_lastpage(search(zipcode,'1',save=False)))
    for page in [str(p) for p in range(1,lastpage+1)]:
      theaters=get_theaters(zipcode,page,save=False)
      for theater in theaters:
        info=theater_info(theater)
        info=clean_info(info)
        save(['url'],info2dictRow(info,zipcode),'locations')
      sleep(INTERVAL)
    save(['zipcode'],{'zipcode':zipcode},'finished_zipcodes')
def go():
  attach('new_mexico_state_audits')
  nodeIds=[row['nodeId'] for row in select('nodeId from nodeIds')]
  scraped_nodeIds=[row['nodeId'] for row in select('nodeId from opinions order by time_scraped')] #So you get different information from consecutive partial runs.
  for nodeId in scraped_nodeIds:
    nodeIds.remove(nodeId)
  if len(nodeIds)==0:
    nodeIds=scraped_nodeIds
  for nodeId in nodeIds:
    print 'Scraping node '+nodeId
    parse(nodeId)
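
#test_wrapper pulls thomaslevine's rows from the eventbrite guest list scraper,
#hands them to the function under test, then checks that the copy written by
#the ajax_tractor scraper matches the original rows.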
def test_wrapper(test_func):
  attach('scraperwiki_events_eventbrite_guestlists')
  original_data=select('* from `ny` where `Twitter Handle`="thomaslevine";')

  table_name=test_func(original_data)
  sleep(5)

  attach('ajax_tractor')
  ajaxed_data=select('* from `%s` where `Twitter Handle`="thomaslevine";' % table_name)
  print original_data,ajaxed_data
  print original_data==ajaxed_data
def copyUrlsDb():
    attach('scraperwiki_scraper_urls')
    save(['url'],select('url,0 as "scraped" from scraper_urls'),'urls')
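
#copyUrlsDb seeds a local `urls` table with every scraper URL and a `scraped`
#flag of 0.  A minimal sketch (not from the original scraper) of how a consumer
#might work through that table; the process() callable is hypothetical:
from scraperwiki.sqlite import select, save

def work_through_urls(process):
    for row in select('url from urls where scraped = 0'):
        process(row['url'])
        #Flip the flag so an interrupted run can resume where it left off.
        save(['url'], {'url': row['url'], 'scraped': 1}, 'urls')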
#!/usr/bin/env python
from urllib2 import urlopen
from lxml.html import fromstring
from lxml.etree import tostring
from scraperwiki.sqlite import save,attach,select
from BeautifulSoup import BeautifulSoup

attach('alafia_network')
pages=select('* from html')

def main():
  for page in pages:
    number=page['page']

    #Replace <br /> tags with a better delimiter
    html=page.pop('table').replace('<br />','|')
    text=''.join(BeautifulSoup(html).findAll(text=True))

    for row in text.split('=-=-=-=-=-=-=-=-=-=-=-=-=-=-='):
      for column in row.split('|'):
        cell=column.split(':')
        if len(cell)==2:
          key,value=cell
          key=key.encode('ascii','ignore').replace(' ','')
          if '&#' in key:
            key=key.split('&#')[0]
          page[key]=value
    save(['page'],page,'microfinance_institutions')

main()
#!/usr/bin/env python
from urllib2 import urlopen
import scraperwiki
import scraperwiki.sqlite as sql

sql.attach("aeso_current_supply_demand_report", "aeso")
sql.attach("alberta_wind_farms", "wind")

data = sql.select("aeso.swdata.name, wind.swdata.lat, wind.swdata.lon, aeso.swdata.mc, aeso.swdata.tng, wind.swdata.turbineType FROM aeso.swdata LEFT JOIN wind.swdata ON aeso.swdata.name = wind.swdata.name")

print data

#With no table_name argument, save writes the joined rows back into this
#scraper's default swdata table, keyed on "name".
sql.save(unique_keys=["name"], data=data)
# TO DO
# * Add some error metric, probably based on latitude and longitude.
# * Consider place type.
# * Run more data.
import operator
from scraperwiki.sqlite import attach, select, get_var, save_var, save, execute, commit
from urllib2 import urlopen
from urllib import urlencode
from json import loads, dumps
attach('capitec_bank')
from time import sleep
from unidecode import unidecode

GEOCODE_URL = 'http://open.mapquestapi.com/nominatim/v1/search?format=json&json_callback=&%s'

COLUMNS = [
  # Null
#  'address',

  # Main effects
#  'branchName',
#  'provinceName',

  # Second-order
#  'branchName || ", South Africa"',
  'provinceName || ", South Africa"',
#  'branchName || ", " || provinceName',

  # All
#  'branchName || ", " || provinceName || ", South Africa"'
]
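
#A minimal sketch (not part of the original scraper) of sending one of the
#address expressions above to the Nominatim endpoint in GEOCODE_URL.  The `q`
#parameter and the lat/lon response fields are standard Nominatim, assumed
#here rather than taken from this excerpt.
def geocode_one(address):
    url = GEOCODE_URL % urlencode({'q': unidecode(address)})
    matches = loads(urlopen(url).read())
    sleep(1)  #Be gentle with the free geocoding service.
    if matches:
        return float(matches[0]['lat']), float(matches[0]['lon'])
    return None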
def initialize():
    execute("""
CREATE TABLE `address` (
  `address-column` text,
  `address-input` text,
  `finished` integer
)""")
    execute("CREATE UNIQUE INDEX column ON `address` (`address-column`,`address-input`);")

    try:
        #PREVIOUS_SCRAPER is only defined when geocodes should be carried over
        #from an earlier scraper run.
        PREVIOUS_SCRAPER
    except NameError:
        execute("""
    CREATE TABLE `geocode` (
      `address-geocode` TEXT,
      `latitude-geocode` REAL,
      `longitude-geocode` REAL,
    
      `number-of-matches` text,
    
      `service` TEXT,
      `address-input` TEXT
    )""")
    else:
        attach(PREVIOUS_SCRAPER)
        save([], select('* FROM `geocode`'), 'geocode')

    execute("CREATE UNIQUE INDEX geocode_key ON `geocode` (`address-input`, `service`);")

    execute('''
CREATE TABLE `branch_address` (
  `address-column` TEXT,
  `address-input` TEXT,
  `entityRecord` INTEGER,
  FOREIGN KEY(entityRecord) REFERENCES scraped(rowid)
)''')
    execute("CREATE UNIQUE INDEX branch_key ON `branch_address` (`entityRecord`, `address-column`);")
    commit()

    for column in COLUMNS:
        execute('''
INSERT INTO `address` (
  `address-column`,
  `address-input`,
  `finished`
) SELECT DISTINCT
    ? as "address-column",
    %s as "address-input",
    0 as "finished"
  FROM
    `scraped`
  ''' % column, column)  #COLUMNS entries are the SQL expressions themselves.
        commit()
        execute('''
INSERT INTO `branch_address` (
  `entityRecord`,
  `address-column`,
  `address-input`
) SELECT
    `rowid`, 
    ? as "address-column",
    %s as "address-input"
  FROM
    `scraped`
  ''' % column, column)
        commit()
    execute('DELETE FROM `address` WHERE `address-input` IS NULL')
    commit()
    execute('''
UPDATE address SET finished = 1
WHERE rowid in (
  select address.rowid from `address`
  join `geocode` on 
  address.`address-input` = geocode.`address-input` where service = "nominatim"
)
''')
    commit()
def initialize():
    execute("""
CREATE TABLE `address` (
  `address-column` text,
  `address-input` text,
  `finished` integer
)""")
    execute("CREATE UNIQUE INDEX column ON `address` (`address-column`,`address-input`);")
    execute("""
CREATE TABLE `geocode` (
  `address-geocode` TEXT,
  `latitude-geocode` REAL,
  `longitude-geocode` REAL,

  `number-of-matches` text,

  `service` TEXT,
  `address-input` TEXT
)""")
    execute("CREATE UNIQUE INDEX geocode_key ON `geocode` (`address-input`, `service`);")

    execute('''
CREATE TABLE `branch_address` (
  `address-column` TEXT,
  `address-input` TEXT,
  `entityRecord` INTEGER,
  FOREIGN KEY(entityRecord) REFERENCES scraped(rowid)
)''')
    execute("CREATE UNIQUE INDEX branch_key ON `branch_address` (`entityRecord`, `address-column`);")
    commit()


    for column in COLUMNS:
        execute('''
INSERT INTO `address` (
  `address-column`,
  `address-input`,
  `finished`
) SELECT DISTINCT
    ? as "address-column",
    %s as "address-input",
    0 as "finished"
  FROM
    `scraped`
  ''' % column, column)  #COLUMNS entries are the SQL expressions themselves.
        commit()
        execute('''
INSERT INTO `branch_address` (
  `entityRecord`,
  `address-column`,
  `address-input`
) SELECT
    `rowid`, 
    ? as "address-column",
    %s as "address-input"
  FROM
    `scraped`
  ''' % column, column)
        commit()
    execute('DELETE FROM `address` WHERE `address-input` IS NULL')

    try:
        #PREVIOUS_SCRAPER is only defined when geocodes should be carried over
        #from an earlier scraper run.
        PREVIOUS_SCRAPER
    except NameError:
        pass
    else:
        attach(PREVIOUS_SCRAPER)
        execute('INSERT INTO main.`geocode` SELECT * FROM `geocode`')
    commit()
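
#A sketch (not from the original scraper) of a driver loop that consumes the
#schema built by initialize(): geocode unfinished rows of `address`, record the
#result in `geocode`, and flip `finished` so partial runs can resume.  It reuses
#the hypothetical geocode_one() sketched earlier; which value belongs in each
#geocode column is inferred from the column names.
def run(batch=50):
    for row in select('`address-input` from `address` where `finished` = 0 limit %d' % batch):
        address = row['address-input']
        result = geocode_one(address)
        save(['address-input', 'service'], {
            'address-input': address,
            'service': 'nominatim',
            'number-of-matches': 0 if result is None else 1,
            'latitude-geocode': result[0] if result else None,
            'longitude-geocode': result[1] if result else None,
        }, 'geocode')
        execute('UPDATE `address` SET `finished` = 1 WHERE `address-input` = ?', [address])
        commit()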
def main():
  attach('canada_elections_contributer_table')
  for qs in select('querystring from contributions where querystring not in (select querystring from contributors)'):
    popup(qs)
from scraperwiki.sqlite import attach, select, get_var, save_var, save, execute, commit, show_tables
from urllib2 import urlopen
from urllib import urlencode
from json import loads, dumps
from time import sleep
from unidecode import unidecode
from geopy import geocoders, distance

attach("capitec_bank")

COLUMNS = {
  # Full address
  'full-address': 'address',

  # Town-country
  'town-country': 'branchName || ", South Africa"',

  # Province-country
  'province-country': 'provinceName || ", South Africa"',

  # Town-province-country
  'town-province-country': 'branchName || ", " || provinceName || ", South Africa"',

  # Postcode-country
  'postcode-country': 'postcode || ", " || "South Africa"'
}
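
#Each key above names a geocoding strategy and each value is the SQL expression
#that builds the corresponding address string from the scraped branch table.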

def geocode():
    if "address" not in show_tables():
        initialize()
from cStringIO import StringIO

from scraperwiki import dumpMessage, sqlite, utils

sourcescraper = 'solid_state_drives'

sqlite.attach(sourcescraper, "src")
data = sqlite.select('* from src.swdata')

from pprint import pprint

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

xs, ys, ys_div = [], [], []
for element in data:
    if not element["size"]: continue
    xs.append(float(element["price"]))
    ys.append(float(element["size"][:-2]))
    ys_div.append(xs[-1] / ys[-1])

subplot = plt.subplot(211)
plt.xlabel(u"Price (GBP)")
plt.ylabel(u"Size (GB)")
subplot.scatter(xs, ys)

subplot = plt.subplot(212)
plt.xlabel(u"Price (GBP)")
plt.ylabel(u"Price / Size (GB)")
subplot.scatter(xs, ys_div)
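
#The figure above is built but never emitted.  A minimal sketch (assumed, not
#from the original) of one way a ScraperWiki view could return it as a PNG,
#reusing the StringIO and utils imports at the top of this example:
import sys
buf = StringIO()
plt.savefig(buf, format='png')
utils.httpresponseheader("Content-Type", "image/png")
sys.stdout.write(buf.getvalue())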
try:
    from scraperwiki.utils import httpresponseheader
    from scraperwiki.sqlite import attach, select
except ImportError:
    def httpresponseheader(a, b):
        pass
from lxml.html import fromstring
from urllib2 import urlopen
from time import time

attach('combine_mix_scraper_spreadsheets_1')
httpresponseheader("Content-Type", "text/csv")
httpresponseheader("Content-Disposition", "attachment; filename=combined_spreadsheets.csv")
print select('spreadsheet from combined_spreadsheets where time = (select max(time) from combined_spreadsheets)')[0]['spreadsheet']
Example #19
0
import re
from scraperwiki.sqlite import attach, select, save


def extract_quotes(text):
    #Pull out sentence-final quotations like '"...," he said' from the text.
    results = []
    all_quotes = re.compile(r'"([^"]*)[,.?!]"')
    for value in all_quotes.findall(text):
        quote = {
            'name': '',
            'quote': value
        }
        results.append(quote)
        #print "a quote" + value
    return results

attach('feedzilla')
articles = select('* from feedzilla.swdata limit 50')

for article in articles:
    quotes = extract_quotes(article['text'])
    if not quotes:
        continue
    for quote in quotes:
        record = {
            'url': article['url'],
            'date': article['date'],
            'name': quote['name'],
            'quote': quote['quote']
        }
        save([], record)

import re
from scraperwiki.sqlite import attach, select, save


def extract_quotes(text):
    #Same extraction as above, kept here in the version used for spot-checking.
    results = []
    all_quotes = re.compile(r'"([^"]*)[,.?!]"')
    for value in all_quotes.findall(text):
        quote = {
            'name': '',
            'quote': value
        }
        results.append(quote)
        #print "a quote" + value
    return results


print extract_quotes('"yes," declared Alex.')


attach('feedzilla')
articles = select('* from feedzilla.swdata limit 20')
    
for article in articles:
    quotes = extract_quotes(article['text'])
    if not quotes:
        continue
    for quote in quotes:
        record = {
            'url': article['url'],
            'date': article['date'],
            'name': quote['name'],
            'quote': quote['quote']
        }
        print record
        #save([], record)
#from urllib import urlencode
from json import loads
from urllib2 import urlopen
from scraperwiki.sqlite import attach,select
attach('nyc_lobbyist_directory_browser')

QUERIES={
  "summary":"""
    SELECT
      sum(reimbursement) AS "Total Reimbursement"
    , sum(compensation) AS "Total Compensation"
    FROM
      `clients_details`
    WHERE
      `detailId`="Total"
    """
, "completed":"""
    SELECT
      count(`href`) AS "completed"
    FROM
      `links`
    WHERE
      `href`<(
        SELECT
          `value_blob`
        FROM
          `swvariables`
        WHERE
          `name`="previous_href"
      )
  """
}
from urllib2 import urlopen
from scraperwiki import swimport
from scraperwiki.sqlite import save,select,attach

URL="http://sw.thomaslevine.com/disclosures.csv"
attach('open_book_new_york')

def load_disclosures():
  csv=urlopen(URL)
  d=swimport('csv2sw').csv2json(csv)
  save([],d,'disclosures')

def join():
  disclosures=select('Entity,upper(Entity) as "ENTITY" from disclosures where entity is not null')
  disclosures_cleaned=[{
    "raw":row['Entity']
  , "clean":remove_ny(row['ENTITY']).strip()
  } for row in disclosures]
  save([],disclosures_cleaned,'disclosures_cleaned')


  licenses=select('Vendor,upper(Vendor) as "VENDOR" from swdata where Vendor is not null')
  licenses_cleaned=[{
    "raw":row['Vendor']
  , "clean":remove_ny(row['VENDOR']).strip()
  } for row in licenses]
  save([],licenses_cleaned,'licenses_cleaned')

def remove_ny(string):
  for ny in ("N.Y.S.","NYS","NEW YORK STATE","NEW YORK","N.Y.","NY"):
    string=string.replace(ny,'')