def parse(rowid):
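  # Fetch the record for this rowid, save its first row to 'organic_operations', and report whether the last page has been reached.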
  d=grab(rowid)
  if len(d['rows'])>0:
    row=d['rows'][0]
    row['id']=rowid
    save(['id'],row,'organic_operations')
  return {"done":d['page']>=d['records']}
def saveFilingTable(url):
    #scrape the url
    raw=scrape(url)
    
    #parse the scrape
    parsed = fromstring(raw)
    
    #find the target table
    trgTable = parsed.cssselect("table")[2]
    
    #extract all the rows
    rows = trgTable.cssselect("tr")
    
    #loop through each row
    for row in rows[1:]:
        cells = row.cssselect("td,th")
        cellcontents = [cell.text_content() for cell in cells]
        data = dict(zip(COLNAMES,cellcontents))
        data['num_workers'] = int(data['num_workers'])
        data['state'] = data['location'].strip()[0:2]
        if data['state'] not in STATES:
            data['state'] = 'unknown'
        
        v = map(int,data['expiration_date'].split('-'))
        data['expiration_date'] = datetime.date(v[2]+2000,v[0],v[1])

        save([],data)
def main(testing = False):
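  # Concatenate the CSV output of every listed scraper (plus a few manual spreadsheets), checking that each file's header matches the first.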
  out = ''
  x = fromstring(urlopen('https://views.scraperwiki.com/run/mix_scraper_spreadsheets/?date='+str(time())).read())
  csv_links = x.xpath('//td[position()=3]/a/@href')

  if testing:
    csv_links = csv_links[0:2]

  #Manual data
  csv_links.append('http://hacks.thomaslevine.com/manual-SA-data-cleaned.csv')

  #Standard Bank data, which was run with Highwall instead of ScraperWiki
  csv_links.append('http://hacks.thomaslevine.com/standardbank-branches-cleaned.csv')
  csv_links.append('http://hacks.thomaslevine.com/standardbank-atm.csv')

  header0, body = getCsv(csv_links[0])
  out += header0[:-2]
  for csv_link in csv_links:
    header, body = getCsv(csv_link)

    if header0 == header:
      out += body[:-2]
    else:
      header_pairs = zip(header0.split(','), header.split(','))
      for pair in header_pairs:
        if pair[0] != pair[1]:
          print pair
      raise ValueError("Headers from %s and %s don't match." % (csv_links[0], csv_link))
  save(['time'], {"time": time(), "spreadsheet": out}, 'combined_spreadsheets')
Example No. 4
def main():
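  # Gather office details from every office-type page, stamp each row with the scrape date, and save to 'final'.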
  d=[]
  for href in get_office_type_hrefs():
    d.extend(get_office_info(href))
  for row in d:
    row['date_scraped']=DATE
  save([],d,'final')
 def record_error(self,error_detail,error_at):
   save([],{
     "request":dumps(self.r.request.data)
   , "request_content":self.r.content
   , "error_detail":error_detail
   , "error_at":error_at
   },'errors')
def search_directory_tree(id,js='getlaw("LAWS","","MAIN")',level=1):
  try:
    sleep(INTERVAL)
#    print 'Searching for %s on level %d' % (js,level)
    save_if_menu(id,js)
    foo,bar,baz=(eval(js.replace('getlaw','')))
    raw=getlaw(foo,bar,baz)
    xml=fromstring(raw)
    links=get_law_links(xml,js)
    if 0==len(links):
      #If there aren't any links, we've reached the lowest level
      save_raw_text(id,raw,time())
      save_law_text(id,xml,time())
      save_state(js,level)
    else:
      #If there are links, save them and descend into them in a depth-first fashion
      #There will only be five levels of recursion, so this is okay for Python even though it doesn't support tail-recursion elimination
      for link in links:
        link['observation']['parentjs']=js
        link['observation']['level']=level
      save(['id'],[link['meta'] for link in links],META)
      save(['id'],[link['observation'] for link in links],OBS)
      save_state(js,level)
      for link in links:
        nextpage=link['observation']['js']
        search_directory_tree(nextid(),nextpage,level+1)
  except:
    log_error(js=js)
    raise
def save_raw_text(id,raw,date_scraped):
  d={
    "id":id
  , "rawtext":raw
  , "date_scraped":date_scraped
  }
  save(['id'],d,'law_text')
def main():
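  # Geocode the addresses in each remaining column, saving results to 'geocoded' and updating 'columns_to_do' so an interrupted run can resume.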
  if get_var('columns_to_do') == None:
    columns = COLUMNS
  else:
    columns = loads(get_var('columns_to_do'))

  while len(columns) > 0:
    column = columns[0]
    d = load_data(column)
    for row in d:
      p = Place(row[column], (row['latitude'], row['longitude']) )
      row_geocode = p.geocode()
      row_geocode.update({
        "address-column":column,
        "branchId": row['branchId']
      })
      sleep(3)
      save([], row_geocode, 'geocoded')
    columns.remove(column)

    if len(columns) == 0:
      save_var('columns_to_do',None)
    else:
      save_var('columns_to_do',dumps(columns))
 def geocode(this):
   this.extract_location()
   for location in this.locations:
     try:
       locs_geo=G.geocode(location,exactly_one=False)
     except geocoders.google.GQueryError:
       pass
     except:
       #You didn't see anything
       pass
     else:
       exact=len(locs_geo)==1
       if not exact:
         indices=range(len(locs_geo))
         indices.reverse()
         for i in indices:
           #print 'Skipping %s' % locs_geo[i][0]
           if 'Egypt' not in locs_geo[i][0]:
             locs_geo.pop(i)
       for loc in locs_geo:
         location_geo,(latitude,longitude)=loc
         save([],{
           "tweet_id":this._tweet['id']
         , "place_raw":location
         , "place_geo":location_geo
         , "latitude":latitude
         , "longitude":longitude
         , "exact":exact
         },'geocode')
def parse_pdf_header(page_data):
    sc_data_agg = []
    nums = range(5, 16)
    for num in nums:
        sc_data = page_data.cssselect('text')[num].text_content()
        match = re.search(r':', sc_data)
        if match:
            sc_data = sc_data.split(':', 1)[1].strip()
        sc_data_agg += [sc_data]
        
    data = dict(zip(variables, sc_data_agg))
    
    if data['address'] != '':
        us = geocoders.GeocoderDotUS() 
        place, (lat, lng) = us.geocode(data['address'])  
        print "%s: %.5f, %.5f" % (place, lat, lng) 
    
        data['lat'] = lat
        data['lng'] = lng
    

    save([], data)


    print data
def scrape_table(url):
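    # Download the page, take the third table, normalize each row (worker count, state, expiration date, PDF link), and save it.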
    download = urlopen(url)
    raw = download.read()
    
    html = fromstring(raw)
    table = html.cssselect('table')[2]
    
    #print [th.text_content() for th in table.cssselect('th')]
    
    for tr in table.cssselect('tr')[1:]:
        cell_text = [td.text_content() for td in tr.cssselect('td')]
        data = dict(zip(COLUMN_NAMES, cell_text))
        data['num_workers'] = int(data['num_workers'])
        if data['location'][:2] in STATES:
            data['state'] = data['location'][:2]
        data['expiration_date'] = datetime.datetime.strptime(
            data['expiration_date'],
            '%m-%d-%y').date()
        a_elements = tr.cssselect('a')
        if len(a_elements) > 1:
            raise ValueError('Row has multiple a tags.')
        elif len(a_elements) == 1:
            data['pdf'] = 'http://www.dol.gov/olms/regs/compliance/cba/' + a_elements[0].attrib['href']
        elif len(a_elements) == 0:
            pass
        
        #print data
        save([], data)
def parse_and_save(root):
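    # Extract each programme in the RTVE listing (title, mp3 URL, type, duration, popularity, date, short description) and save it.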
    global podcasts_count
    links = root.xpath('//div[@class="ContentTabla"]/ul/li')[1:]
    
    for link in links:
        url = 'http://www.rtve.es' + link.xpath('span[@class="col_tit"]/a/@href')[0]
        titulo = link.xpath('span[@class="col_tit"]/a/text()')[0].encode('latin-1')

        # Some entries are missing the download button, but the mp3 still seems to be available (e.g. page 9):
        # http://www.rtve.es/alacarta/audios/carne-cruda/carne-cruda-paralisis-permanente-revive-07-03-12/1342911/
        try:
            url_mp3 = 'http://www.rtve.es' + link.xpath('span[@class="col_tip"]/a/@href')[0]
        except IndexError:
            print 'WARNING: Download not available:', url
            url_mp3 = None

        tipo = "".join(link.xpath('span[@class="col_tip"]/text()')).strip()
        duracion = link.xpath('span[@class="col_dur"]/text()')[0]
        popularidad = link.xpath('span[@class="col_pop"]/span/em/strong/span/text()')[0]
        fecha = link.xpath('span[@class="col_fec"]/text()')[0]
        desc_corta = link.xpath('div//span[@class="detalle"]/text()')[0].encode('latin-1')

        save([], {'titulo':titulo, 'url':url, 'url_mp3':url_mp3, 'tipo':tipo, 'duracion':duracion, 'popularidad':popularidad, 'fecha':fecha, 'descripcion_corta':desc_corta})
        print '%s: %s' %(fecha, titulo)
        podcasts_count = podcasts_count +1
def justice_generator(html, current):
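    # Step through the page with regexes, yielding one record per justice: image, name, office, short title, postnominal, and biography HTML.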
    pos=0
    while pos < len(html):
        data={}
        pos=get_pattern('''<div.*?/div>''', html, pos, [], data)
        pos=get_pattern('''<img *(src="([^"]*)"| *alt="([^"]*)")+[^>]*/>''', html, pos, [None, 'img_url', 'img_alt'], data)
        pos=get_pattern('''<h4[^>]*>(.*?)</h4>''', html, pos, ['name'], data)
        pos=get_pattern('''<p><strong>(.*?)</strong></p>''', html, pos, ['full_title'], data)
        # ''' 
        print(data)
        full_title_match=re.match('(?i)(President of The Supreme Court|Deputy President of The Supreme Court|Justice of The Supreme Court) *, *the right hon *(the)? *((Lord|Lady|Baroness)[-a-zA-Z ]+)(, *(.+))?', data['full_title'])
        if full_title_match:
            data['office']=full_title_match.group(1).strip()
            data['short_title']=full_title_match.group(3).strip()
            pn=full_title_match.group(6)
            if pn is not None:
                data['postnominal']=pn.strip()
            else:
                data['postnominal']=None
        else:
            sqlite.save(unique_keys=['full_title'], data=data, table_name='full_title_anomalies')
            print("Anomaly {0}".format(repr(data)))
            continue
        top_match=back_to_top_pattern.search(html, pos)
        if top_match:
            biography_html=html[pos:top_match.start()]
            pos=top_match.end()
        else:
            biography_html=html[pos:]
            pos=len(html)
        data['biography_html']=biography_html
        yield(data)
def get_item_tree(root_node):
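    # Save each top-level category, then descend into its children.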
    caters = root_node.xpath('//a[@class="level1"]/@href')
    caters_name = root_node.xpath('//a[@class="level1"]/text()')
    for (cat,name) in zip(caters,caters_name):
        val = int(re.findall('id_seccion=([0-9]*)',cat)[0])
        save(['id'],{'id':val,'name':name.encode('latin-1'),'parent':0,'leaf':0},table_name='categorias')
        get_children_from_cat(root_node,val)
def saveFilingTable(url):
    muffin = scrape(url)
    
    banana = fromstring(muffin)
    
    tea = banana.cssselect('table')
    you=tea[2]
    
    marcus = you.cssselect('tr')
    for jay in marcus[1:]:
        tractor = jay.cssselect('td,th')
        aidan = [apple.text_content() for apple in tractor]
        #print COLNAMES
        #print aidan
        #print zip(COLNAMES, aidan)
        data = dict(zip(COLNAMES, aidan))
        data['state']=data['location'].strip()[0:2]
        data['num_workers']=int(data['num_workers'])

        assert data['state'] in [
      'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
      'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
      'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
      'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
      'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

        save([],data)
def saveFilingTable(url):
    muffin = scrape(url)
    
    banana = fromstring(muffin)
    
    tea = banana.cssselect('table')
    you=tea[2]
    
    marcus = you.cssselect('tr')
    for jay in marcus[1:]:
        tractor = jay.cssselect('td,th')
        aidan = [apple.text_content() for apple in tractor]
        data  = dict(zip(COLNAMES,aidan))
        print(data)
        data['state']=data['location'].strip()[0:2]
        data['num_workers']=int(data['num_workers'])
        if data['state'] not in [ 'Na',
      'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
      'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
      'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
      'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
      'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']:
            data['state']='unknown'
        v = map(int,data['expiration_date'].split('-'))
        data['expiration_date'] = datetime.date(v[2]+2000,v[0],v[1])
        #data['expiration_date'] = datetime.datetime.strptime(
        #    data['expiration_date'],
        #    '%m-%d-%y'
        #).date()
        print(data)
        save([],data)
def scrape():
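    # Save the board's topics, then parse and save the links found in each topic.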
    d = getTopics(BOARD)
    save(['topic-href'], d, 'topics')
    topic_hrefs = [row['topic-href'] for row in d]
    for topic_href in topic_hrefs:
        d = parseTopic(topic_href)
        save([], d, 'links')
Example No. 18
def getpdfs():
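    # Collect the PDF links from the Safaricom page and save each PDF converted to XML.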
    html = parse('http://www.safaricom.co.ke/index.php?id=275').getroot()
    html.make_links_absolute()
    pdf_urls = html.xpath('//table[@class="contenttable" and @width="540"]/descendant::a/@href')
    
    for url in pdf_urls:
        save(['date_scraped', 'url'], {"date_scraped": DATE, "url": url, "pdfxml": pdftoxml(urlopen(url).read())}, 'pdfs')
def grab(startitem=1,extracolumns={},oncompletion=_oncompletion_default):
  #Grab
  _print('Downloading')
  xml=get_search_page(startitem)

  #Parse
  _print('Parsing')
  rows=parse(xml)

  #Add some new information
  search_id=copy(startitem)
  for row in rows:
    #Identifiers we know which items we've scraped.
    row['search_id']=search_id
    search_id=search_id+1
    #Any extra information
    row.update(extracolumns)

  #Save to the datastore
  save([],rows,'directory')

  #Recurse
  if is_last_page(xml):
    oncompletion()
  else:
    _print("Finished items "+' to '.join(map(str,current_items(xml)))+' of '+str(matched_items(xml)))
    _print("Searching for items "+str(startitem)+" to "+str(startitem+5))
    grab(startitem+5,extracolumns,oncompletion)
Example No. 20
def deep_scrape(urn):
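    # Merge the summary page and several detail pages for one school URN into a single record and save it.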
    data = {}

    def merge_in(d):
        "update data with d; complain if anything is overwritten"
        for (k, v) in d.iteritems():
            if k in data:
                assert data[k] == v, "%s: [%s] != [%s]" % (k, data[k], v)
            else:
                data[k] = v

    merge_in(summary_scrape(urn))
    merge_in(page_scrape("general", urn))
    merge_in(page_scrape("communications", urn))
    merge_in(page_scrape("regional-indicators", urn))

    try:
        if "Headteacher" not in data:
            data["Headteacher"] = "".join(
                [data["Headteacher Title"], data["Headteacher First Name"], data["Headteacher Last Name"]]
            )

        if data["Easting"] == "" or data["Northing"] == "":
            raise Exception("No Location Data")

        data = {key: data[key] for key in keys_to_keep}

        sqlite.save(unique_keys=["URN"], data=data)

    except Exception as e:
        print "Error: " + e.message
def process (name, date):
    newdate = date[8:10] + "_" + date[5:7] + "_" + date[0:4]
    url = r"http://www.lloydsbankinggroup.com/media/excel/2010/%s_historic_data.xls" % newdate
    print url
    url = r"http://www.lloydsbankinggroup.com/media/excel/2010/04_06_10_historic_data.xls"
    book = xlrd.open_workbook(file_contents=scrape(url))
    sheet = book.sheet_by_name (name)
    months=['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

    data = []
    i = 1
    while i < 500:
        try:
            month = sheet.cell_value (i, 0)
            year  = sheet.cell_value (i, 1)
            level = sheet.cell_value (i, 2)
        except:
            break
        when= "%04d-%02d-01" % (int(year), months.index (month) + 1)
        i = i + 1
        data.append (level)        
        sqlite.save(unique_keys=["Date"], data={"Date":when, "Index":level})

    chart = SimpleLineChart(500, 255, y_range=[0, 700])
    chart.add_data (data)
    metadata.save("chart", chart.get_url())
def main():
  """Check what has been scraped so far, then resume.
  It might be good to check for gaps in the scraping.
  Or maybe a recursive approach isn't the best for
  search pages like this."""

  #What's already been scraped recently?
  if not 'directory' in show_tables():
    last_searched=0
  else:
    #Only skip things from the current scraper completion attempt.
    if 'scrape_completions' in show_tables():
      raw_ids=select('scrape_ids from scrape_completions order by completion_id desc limit 1')[0]['scrape_ids']
      max_to_ignore=max(map(int,raw_ids.split(',')))
      min_to_scrape=max_to_ignore+1
    else:
      min_to_scrape=1
    incomplete_scrape=select('max("search_id") as m from directory where scrape_id>='+str(min_to_scrape))[0]['m']
    if incomplete_scrape!=None:
      last_searched=incomplete_scrape
    else:
      last_searched=0

  if 'scrape_times' in show_tables():
    last_id=select('max("scrape_id") as m from scrape_times')[0]['m']
  else:
    last_id=0

  #Time of scrape start
  scrape_id=last_id+1
  save(['scrape_id'],{"scrape_id":scrape_id,"scrape_time":time()},'scrape_times')
  grab(last_searched+1,{"scrape_id":scrape_id},oncompletion=oncompletion)
def locationscraper(locationpagesource, idcount):
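    # Extract one store location's city, coordinates, phone number, and postal code from the page source and save it.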
    address = "not available"
    storeid = "error"
    latlong = "none"
    phone = "none"
    postal = "none"
    city = re.search('BR/>(.+?),', locationpagesource, re.I)
    city = city.group(1)
    city = re.sub("<BR/>", ", ", city)
    storeid = idcount
    latlong = re.search("init\((.+?),'<", locationpagesource)
    latlong = re.sub("'", "", latlong.group(1))
    if re.search('\(\d{3}\) \d{3}-\d{4}', locationpagesource, re.DOTALL|re.S|re.I):
        phone = re.search('\(\d{3}\) \d{3}-\d{4}', locationpagesource, re.DOTALL|re.S|re.I)
        phone = phone.group(0)
    if re.search('\D\d\D \d\D\d', locationpagesource):
        postal = re.search('\D\d\D \d\D\d', locationpagesource)
        postal = postal.group(0)
    row_data = {'Address': city, 'Latlong': latlong, 'Phone': phone, 'Postal Code': postal, 'Store ID': storeid}
    try: 
        save([],row_data)
    except: 
        city = "Address unavailable"
        row_data = {'Address': city, 'Latlong': latlong, 'Phone': phone, 'Postal Code': postal, 'Store ID': storeid}
        save([],row_data)
def Main():
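    # Download every CSV of spending over £1,000 linked from the GLA page, normalize the columns, and save each row.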
    page = urllib2.urlopen("http://www.london.gov.uk/who-runs-london/greater-london-authority/expenditure-over-1000")
    soup = BeautifulSoup(page)
    for link in soup.html.body.findAll('a',{'href': re.compile(r'(csv)$')}):
        quotedLink = link['href'].replace(' ','%20')
        report = urllib2.urlopen(quotedLink).readlines()
        headerLine = findHeaderLine(report)
        reader = csv.DictReader(report[headerLine:])
    
        for rowNumber, row in enumerate(reader):
            #print row
            amount, currency = fixAmount(tryGetColumn(row,'Amount') or tryGetColumn(row,'Amount £') or tryGetColumn(row,'Amount Paid') or tryGetColumn(row, 'Amount\n\xa3') or tryGetColumn(row, 'Amount\n\x9c'))
            data = {
                    'link'        : quotedLink,
                    'rowNumber'   : rowNumber,
                    'supplier'    : tryGetColumn(row,'Supplier') or tryGetColumn(row,'Vendor') or tryGetColumn(row,'Vendor Name'),
                    'amount'      : amount,
                    'currency'    : currency,
                    'description' : tryGetColumn(row,'Expense Description') or tryGetColumn(row,'Expenditure Account Code Description'),
                    'docType'     : tryGetColumn(row,'Doc Type'),
                    'docNumber'   : tryGetColumn(row,'Doc No') or tryGetColumn(row,'SAP\nDocument No'),
                    'date'        : fixDate(tryGetColumn(row,'Date') or tryGetColumn(row,'Clearing \nDate'))
                    }
        
            if data['supplier'] and data['amount'] and data['description'] and data['amount']!='Amount Paid':
                sqlite.save(['link','date', 'amount', 'supplier'],data, date=data['date'])
def main():
  #What has already been scraped
  if 'contributions' in show_tables():
    scraped=[row['querystring'] for row in select('querystring from contributions')]
  else:
    scraped=[]

  pagenumber=0
  while True:
    pagenumber=pagenumber+1
    xml=load(pagenumber)

    #Get the header row
    rows=xml.xpath('//table[@class="table_text"][tr[@class="tan_row"]]')[0].getchildren()[1:]
    keys=['name','contestant_party_district','date_received','class_and_partnum','association','monetary','non-monetary']

    #Get the data rows
    ds=[]
    for row in rows:
      d={}
      cells=row.getchildren()
      contributor=cells.pop(0).getchildren()[0]

      d['querystring']=contributor.attrib['href'].replace("javascript:PopUp('contributor.aspx?",'').replace("', '300', '300');",'')
      d[keys[0]]=contributor.text
      for i in range(1,len(cells)):
        d[keys[i]]=cells[i].text
      ds.append(d)

    #Don't run again if already run
    if ds[0]['querystring'] in scraped:
      break
    else:
      save(['querystring'],ds,'contributions')
    def parse(self, text):
        cleaned_text = text.replace('\n','').replace('\r','').replace('\t','')
        html = fromstring(cleaned_text)
        tds=html.xpath('//td[a]')
        branches=[branchinfo(td) for td in tds]
        for row in branches:
            row['cityUrl'] = self.url

            splitchar = '\n' if row['address'].count('\n') > 0 else ','
            splitaddress=row['address'].split(splitchar)

            l=len(splitaddress)
            if l==3:
                row['street-address'],row['subtown'],row['town2']=splitaddress
            elif l==2:
                row['street-address'],row['subtown']=splitaddress
            elif splitaddress == ['']:
                print 'Empty address'
            else:
                print row['map_Address_']
                print splitaddress
                raise ValueError('Weird address')

            if row.has_key('street-address'):
                row['street-address'] = row['street-address'].strip()

            row['address'] = strip_address(row['address'])
            row['scraperrun'] = scraperrun

        save(['scraperrun', 'cityUrl'], branches,'branches')
def apples():
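    # Scrape the Department of Labor collective bargaining agreements table, saving one row per agreement with state, worker count, expiration date, and PDF link.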
    download = urlopen('http://www.dol.gov/olms/regs/compliance/cba/Cba_CaCn.htm')
    rawhtml = download.read()
    
    html = fromstring(rawhtml)
    tables = html.cssselect('table')
    table = tables[2]
    
    trs = table.cssselect('tr')
    for tr in trs[1:]:
        tds = tr.cssselect('td')
        cell_values = [td.text_content() for td in tds]
        data = dict(zip(COLUMN_NAMES, cell_values))
        print data
        
        #state
        if data['location'][:2] in STATES:
            data['state'] = data['location'][:2]
        
        data['num_workers'] = int(data['num_workers'])
        data['expiration_date'] = datetime.datetime.strptime(data['expiration_date'], '%m-%d-%y').date()
    
        links = tr.cssselect('a')
        if len(links) == 1:
            data['pdf'] = 'http://www.dol.gov/olms/regs/compliance/cba/' + links[0].attrib['href']
        elif len(links) > 1:
            assert False
    
        print data
        save([], data)
def locationscraper(locationpagesource, idcount):
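    # Extract one store's street address, city, country, latitude, longitude, and phone number from the page source and save it.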
    address = "not available"
    storeid = "error"
    lat = "none"
    longitude = "none"
    phone = "None available"
    country = "Not listed"
    city = re.search('class="locality">(.+?)<', locationpagesource, re.DOTALL|re.S)
    city = re.sub('&nbsp;', '', city.group(1))
    city = re.sub(',', '', city)
    if re.search('class="street-address"', locationpagesource):
        address = re.search('class="street-address">(.+?)<', locationpagesource, re.DOTALL|re.S)
        address = address.group(1)
    if re.search('class="country-name">(.+?)<', locationpagesource):
        country = re.search('class="country-name">(.+?)<', locationpagesource)
        country = country.group(1)
    storeid = idcount
    lat = re.search('data-store-lat="(.+?)"', locationpagesource)
    lat = lat.group(1)
    longitude = re.search('data-store-lon="(.+?)"', locationpagesource)
    longitude = longitude.group(1)
    if re.search('\d{3}-\d{3}-\d{4}', locationpagesource, re.DOTALL|re.S|re.I):
        phone = re.search('\d{3}-\d{3}-\d{4}', locationpagesource, re.DOTALL|re.S|re.I)
        phone = phone.group(0)
    row_data = {'Address': address, 'City': city, 'Country': country, 'Lat': lat, 'Long': longitude, 'Phone': phone, 'Store ID': storeid}
    save([],row_data)
def parse_branch(xml,url,region):

  #Get table
  max_trs=max([table.xpath('count(tr)') for table in xml.xpath('//table')])
  table_nodes=xml.xpath('//table[count(tr)=%d]'%max_trs)

  #Check
  l=len(table_nodes)
  if l!=1:
    raise ParseError("I could not identify the appropriate table; %d candidates were found." % l)
  else:
    table=table_nodes[0]

  #Parse
  #from lxml.html import tostring
  #print tostring(table)
  d=parse_branch_table(table)
  d=parse_branch_table_strings(d)
  for row in d:
    row['date_scraped']=DATE
    row['region']=region
    row['url']=url
    row['full-address'] = strip_address(row['full-address'])
    row['street-address'] = strip_address(row['street-address'])
  #print [row.keys() for row in d]
  save([],d,'branches')
def analyze():
    d = select("""
  `link-href`, GROUP_CONCAT(`author`) AS `authors`, count(*) AS "count"
FROM `links`
JOIN `topics` ON `links`.`topic-href` = `topics`.`topic-href`
GROUP BY `link-href`
""")
    execute('DROP TABLE IF EXISTS `wrote-about-same-things`')
    save([], d, 'wrote-about-same-things')
    print '''
These look most exciting because three different people wrote about each.

3    Kiana Fitzgerald,Sara Peralta,Susan Raybuck    http://schedule.sxsw.com/2012/events/event_IAP100409
3    Shawn Dullye,Joe Vasquez,Sara Peralta          http://schedule.sxsw.com/2012/events/event_IAP10593
3    Shawn Dullye,Kiana Fitzgerald,Sara Peralta     http://schedule.sxsw.com/2012/events/event_IAP13848

Of course, that isn't adjusted for how many each person wrote.
'''

    d = select("""
  author, count(*) AS `how-many` FROM `links`
JOIN topics on links.`topic-href` = topics.`topic-href`
GROUP BY author
ORDER BY 2 DESC
""")
    save(['author'], d, 'how-many-did-you-link')
    print """
    def parse(self, text):
        html = fromstring(text)
        citiesParent = html.xpath('//select') #This should actually have an option child, but lxml fixes the wrong html
        assert len(citiesParent)==1
    
        cities=options(citiesParent[0],valuename="cityId",textname="cityName",ignore_value="0")
        for city in cities:
            city['provinceUrl'] = self.url
            city['cityUrl'] = URLS['cities-base'] + city['cityId']
            city['scraperrun'] = scraperrun

        save(['cityUrl', 'scraperrun'], cities, 'cities')
        return [City(c['cityUrl']) for c in cities]
def download():
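  # Search each letter of the alphabet, extract branch details from every result table, and save them all with the search URL and scrape date.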
  d=[]
  for letter in ascii_lowercase:
    x=search_letter(letter)
    branch_tables=x.cssselect('table.locatorTable table')
    d_letter=[extract_branch_info(branch_table) for branch_table in branch_tables]

    for record in d_letter:
      record['url']=searchurl(letter)
      record['date_scraped']=DATE

    d.extend(d_letter)
  save([],d)
def scrape_pct(link,pct_name):
    """
    Scrapes the data associated with the PCT, and calls functions to scrape
    data associated with the services.
    """
    
    url = "http://www.nhs.uk" + link
    root = lxml.html.parse(url).getroot()

    d = {}

    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    print lxml.html.tostring(root)
    address = root.cssselect("div.panel-content div.pad p")[0].text
    d["address"] = address
    d["postcode"]= geo.extract_gb_postcode(address)
    try:
        d["lat"], d["lng"] = geo.gb_postcode_to_latlng(d["postcode"])
    except:
        print "Postcode not found", d["postcode"]
    d["info HTML"] = url

    colour = "green"
    # quality
    for t in root.findall("body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v
        if k == "Fair":
            colour = "yellow"
    d["colour"] = colour

    # head honcho
    for t in root.findall("body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"):
        d["Boss"] = t.text.replace("<br />",", ")

    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class",False)=="intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = (d.get("boilerplate","")+"\n"+t.text).strip()

    sqlite.save(unique_keys=["PCT","type","name"], data=d)
    
    scrape_facilities(pct_name,root)
    scrape_others(pct_name,url)
Example No. 34
def main():
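    # Page through each institution category (MFI, Unions, SARL, SA), collect all listings, stamp them with the scrape date, and save.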
    out = []
    #outfile = open('rwandamicrofinance.json','w')
    kinds = {('7','123'): 'MFI', ('8','124'): 'Unions', ('9','125'): 'SARL', ('10','126'): 'SA'}
    for (k1,k2),v in kinds.iteritems():
        start = 'index.php?option=com_sobi2&catid=%s&Itemid=%s&lang=en'%(k1,k2)
        result, n = eachpage(start, v)
        out.extend(result)
        while n!=None:
            result,n = eachpage(n, v)
            out.extend(result)
    for row in out:
        row['date_scraped'] = DATE
    save([], out)
Example No. 35
def ScrapeProfile(site, link):
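    # Scrape one inmate profile: photo, address, holding location, physical details, and one saved row per case.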
    page = urlopen(link)
    rawtext = page.read()
    html = fromstring(rawtext)
    print tostring(html)
    tables = html.cssselect('table') #3 tables on profile, all with valid information
    
    try:
        imgs = html.cssselect('img')
        data = {'image_link': site + imgs[1].attrib['src']}
    except IndexError:
        print "image not available"
        data = {'image_link': 'None'}
    
    divinfo = html.cssselect("#inmateAddress")
    address = str(divinfo[0].text_content().strip())
    address = address[5:].strip()
    data['address'] = address

    divinfo = html.cssselect("#holdingLocation")
    location = str(divinfo[0].text_content().strip())
    location =  location[17:]
    data['location'] = location
    

    Table_HEADERS = [['id','name','book_date'],
    ['age','height','weight','race', 'sex', 'eye', 'hair'],
    ['case_num', 'description', 'bond_amount', 'bond_type']]

    for i in range(2):
        tabledata = []
        for tr in tables[i].cssselect('tr'): #this table contains ID, NAME, BOOKDATE
            cellvalues = [td.text_content().strip() for td in tr.cssselect('td')]
            tabledata.extend(cellvalues)
        data = dict(data.items() + dict(zip(Table_HEADERS[i], tabledata)).items())    
    
    
    for tr in tables[2].cssselect('tr')[1:]: #Table 2 contains case number(s), description and cash or Bond type
        cellvalues = [td.text_content().strip() for td in tr.cssselect('td')]
        data1 = dict(zip(Table_HEADERS[2], cellvalues))
        data3 = dict((data.items() + data1.items()))
        data3['age'] = int(data3['age'])
        data3['weight'] = int(data3['weight'])
        data3['id'] = int(data3['id'])
        data3['bond_amount'] = int(data3['bond_amount'].strip('$').replace(',',''))
        data3['book_date'] = datetime.datetime.strptime(data['book_date'], '%m/%d/%Y %I:%M %p').date()
        #print data3
        data3['id_CASENUM'] = str(data3['id']) +'_' + data3['case_num'] +'_' + data3['description'][:6] #used for unique key
        print data3['id_CASENUM']
        save(['id_CASENUM'],data3)
def parse(url,nodeIds):
  xml=get(url)
  for a in xml.xpath('//div[@class="profile-container"]/div[@class="node-body"]/a'):
    nodeId=a.attrib['href'].split('/')[-1]
    if nodeId in nodeIds:
      #Remove it to speed up future runs
      nodeIds.pop(nodeId)
    else:
      #Add it to the database
      d={
        "nodeId":nodeId
      , "first_scraped":time()
      }
      save(['nodeId'],d,'nodeIds')
def branchinfo(href):
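    # Save the sub-branches and outlets listed on one branch page.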
    x = fromstring(urlopen(DIR + href).read())
    for thingtype in ("Sub-Branches", "Outlets"):
        locations = [
            loc.strip() for loc in x.xpath(
                '//p[strong/text()="%s"]/following-sibling::p[position()=1]/text()'
                % thingtype)
        ]
        d = [{
            "location": location,
            "date_scraped": DATE,
            "branch-href": href
        } for location in locations]
        save([], d, "branch_" + thingtype)
Example No. 38
def save_page(url,table_name="pages"):
  "Save a url directly to the datastore."
  try:
    handle=urlopen(url)
    text=handle.read()
  except urllib2_URLError as e:
    badurl(url,e)
  except HTTPError as e:
    badurl(url,e)
  except BadStatusLine as e:
    badpage(url,e)
  else:
    d={"url":url,"text":text}
    save(['url'],d,table_name)
  def descend(self,selectIndex=0):
    """Traverse the form fields in a depth-first fashion.
Sometimes, a form will provide no responses, but this isn't actually a problem
because the loop just does nothing in that case."""
    select=self.SELECTS[selectIndex]
    options=getattr(self,'extract_%s' % select['plural'])()
    save([],options,select['plural'])
    for option in options:
      getattr(self,'submit_%s' % select['singular'])(option['value'])
      if self.is_megabus():
        option['is_megabus']=True
        save([],option,select['plural'])
      elif selectIndex < len(self.SELECTS)-1:
        self.descend(selectIndex+1)
def main():
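  # Number each block, then save the block's details and each of its branches with the scrape date.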
  blocks=get_blocks()
  blockId=0
  for block in blocks:
    blockId+=1
    block_info=block.data()
    block_info['blockId']=blockId
    block_info['date_scraped']=DATE
    save([],block_info,'blocks')
    for branch in block.branches():
      branch_info=branch.data()
      branch_info['blockId']=blockId
      branch_info['date_scraped']=DATE
      save([],branch_info,'branches')
Example No. 41
def main():
  #Load
  xml=swimport('dsp').dsp('http://www.khula.org.za/Admin/Contacts/RegionalContacts.aspx',False)

  #Parse
  t_nodes=xml.xpath('//table[@width="100%"]')
  assert len(t_nodes)==1
  table=t_nodes[0]
  d=parse_table(table)
  t=time()
  for row in d:
    row["date_scraped"]=t
  d=moreparsing(d)
  save([],d,'final')
Example No. 42
def parse(url,xml=None,suffix=''):
  if xml==None:
    xml=pull(url)
  sunday=xml.xpath('//h2[@class="date-header"]')[0].text

  twosided=xml.xpath('//div[@class="flipit"]/a[@onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}"]')

  #Get the postcards
  postcards=xml.xpath('//div[@class="flipit"]/a[@onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}"][2]')
  for postcard in xml.xpath('//a[@onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}"]'):
    if not postcard in twosided:
      postcards.append(postcard)

  for a in postcards:
    if not 'bp.blogspot.com' in a.attrib['href']:
      #Not a postcard
      break

    save(["url","image"],image(a.attrib['href']),"images"+suffix)

    if _isTwosided(a):
      url2=a.getprevious().attrib['href']
      save(["url","image"],image(url2),"images"+suffix)
      save(["url1","url2"]
      , meta(a,sunday,url2=url2)
      , "postcards"+suffix)

    else:
      save(["url1"]
      , meta(a,sunday)
      , "postcards"+suffix)
Example No. 43
def main():
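    # Scrape American University's Fall 2012 course schedule: course-level fields plus one saved row per section.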
    Fall2012 ='https://www.american.edu/provost/registrar/schedule/schedule-results.cfm?term=2012FN&subj=&search=&mode=title&stat=ALL&hr=&mn=&ampm=AM&class=Search+Courses'
    Fallpage = urlopen(Fall2012)
    rawtext = Fallpage.read()
    html = fromstring(rawtext)
    print tostring(html)
    
    COURSE_KEYS = ['CourseNum', "Title", "Prerequisite", "Course Description"]
    SECTION_KEYS = ['Status','section','credit','instructor','time']    

    maindivs = html.cssselect(".crs-data")
    for crs in maindivs:
        COURSEdata = []
        header =crs.cssselect('.crs-header')[0]
        secs = crs.cssselect('.sec-details')
        #print tostring(header[0]), tostring(secs[0])
        headerdivs = header.cssselect('div')[1:]
        COURSEdata.extend([div.text_content().strip() for div in headerdivs[:2]])
        
        if len(headerdivs)==5:
            prereq = headerdivs[3].text_content().strip()
            COURSEdata.append(prereq[14:])
        else:
           COURSEdata.append('') 

        descriptionlink = 'https://www.american.edu/provost/registrar/schedule/' + headerdivs[2].cssselect('a')[0].attrib['href']
        descriptionrawtext = fromstring(urlopen(descriptionlink).read())
        try:
            COURSEdata.append(descriptionrawtext.cssselect('.course-header')[0].cssselect('p')[1].text_content().strip())
        except:
            COURSEdata.append('NONE')

        COURSEdata = dict(zip(COURSE_KEYS,COURSEdata))
        #print COURSEdata
        
        for sec in secs:
            SECdata = []
            sectionDivs = sec.cssselect('div')[1:]
            SECdata.append(sectionDivs[0].text_content().strip()) #status
            SECdata.append(sectionDivs[1].text_content().strip()) #sectionNum
            SECdata.append(sectionDivs[3].text_content().strip()) #credits
            SECdata.append(str(sectionDivs[4].text_content().strip())) #professor
            SECdata.append(str(sectionDivs[8].text_content().strip())) #times
            SECdata = dict(zip(SECTION_KEYS,SECdata))
            SECdata = dict(COURSEdata.items() + SECdata.items())
            #SECdata['section'] = int(SECdata['section'])
            #SECdata['credit'] = int(SECdata['credit'])
            SECdata['key'] = SECdata['CourseNum'] + str(SECdata['section']) #used as unique key
            save(['key'],SECdata)
def Main():
    page = urllib2.urlopen(
        "http://www.london.gov.uk/who-runs-london/greater-london-authority/expenditure-over-1000"
    )
    soup = BeautifulSoup(page)
    for link in soup.html.body.findAll('a', {'href': re.compile(r'(csv)$')}):
        quotedLink = link['href'].replace(' ', '%20')
        report = urllib2.urlopen(quotedLink).readlines()
        headerLine = findHeaderLine(report)
        reader = csv.DictReader(report[headerLine:])

        for rowNumber, row in enumerate(reader):
            #print row
            amount, currency = fixAmount(
                tryGetColumn(row, 'Amount') or tryGetColumn(row, 'Amount £')
                or tryGetColumn(row, 'Amount Paid')
                or tryGetColumn(row, 'Amount\n\xa3')
                or tryGetColumn(row, 'Amount\n\x9c'))
            data = {
                'link':
                quotedLink,
                'rowNumber':
                rowNumber,
                'supplier':
                tryGetColumn(row, 'Supplier') or tryGetColumn(row, 'Vendor')
                or tryGetColumn(row, 'Vendor Name'),
                'amount':
                amount,
                'currency':
                currency,
                'description':
                tryGetColumn(row, 'Expense Description')
                or tryGetColumn(row, 'Expenditure Account Code Description'),
                'docType':
                tryGetColumn(row, 'Doc Type'),
                'docNumber':
                tryGetColumn(row, 'Doc No')
                or tryGetColumn(row, 'SAP\nDocument No'),
                'date':
                fixDate(
                    tryGetColumn(row, 'Date')
                    or tryGetColumn(row, 'Clearing \nDate'))
            }

            if data['supplier'] and data['amount'] and data[
                    'description'] and data['amount'] != 'Amount Paid':
                sqlite.save(['link', 'date', 'amount', 'supplier'],
                            data,
                            date=data['date'])
Example No. 45
def main():
  if not 'cities_done' in show_tables():
    cities_done=[]
  else:
    cities_done=select('* from cities_done')

  for fromcity in CITIES_NY:
    for tocity in CITIES_NY:
      if fromcity==tocity:
        print 'Skipping within-%s route' % fromcity
      elif {"from":fromcity,"to":tocity} in cities_done:
        print 'Already scraped %s to %s' % (fromcity,tocity)
      else:
        grab(fromcity,"NY",tocity,"NY")
        save([],{"from":fromcity,"to":tocity},'cities_done')
Example No. 46
def geocode():
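    # Geocode each unfinished address with every service, save the results, and mark the address finished.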
    if "address" not in show_tables():
        initialize()

    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]
        #print address
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params )
        commit()
Example No. 47
 def parse(self, text):
     text = '\n'.join(text.split('\n')[2:4]).replace("document.getElementById('bizdir_directory').innerHTML = '", '')
     text = re.sub(r"';\s*document.getElementById('bizdir_search').disabled = false;", '', text).replace("&nbsp;&nbsp;8</div>';", '&nbsp;&nbsp;8</div>').replace("\\'", '')
     html = fromstring(text)
     bizdir_directory = []
     for tr in html.cssselect('#bizdir_directory tr'):
         try:
             assert tr.xpath('count(td)') == 1
             name = element(tr, 'td/b').text_content()
             description = element(tr, 'td/p/text()')
             bizdir_directory.append({'name': name, 'description': description, 'pageOffset': self.offset, 'scraperrun': scraperrun})
         except:
             print tostring(tr)
             raise
     save(['scraperrun', 'pageOffset', 'name'], bizdir_directory, 'organizations')
def join():
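  # Save upper-cased, cleaned versions of disclosure entity names and license vendor names so the two tables can be matched.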
  disclosures=select('Entity,upper(Entity) as "ENTITY" from disclosures where entity is not null')
  disclosures_cleaned=[{
    "raw":row['Entity']
  , "clean":remove_ny(row['ENTITY']).strip()
  } for row in disclosures]
  save([],disclosures_cleaned,'disclosures_cleaned')


  licenses=select('Vendor,upper(Vendor) as "VENDOR" from swdata where Vendor is not null')
  licenses_cleaned=[{
    "raw":row['Vendor']
  , "clean":remove_ny(row['VENDOR']).strip()
  } for row in licenses]
  save([],licenses_cleaned,'licenses_cleaned')
def moreparsing_map():
  "Map along the most recent results in the table (like a Couch map) and return a new one"
  d=select("* FROM `swdata` WHERE date_scraped=(SELECT max(date_scraped) from `swdata`);")
  for row in d:
    row['street-address'],row['postal-code']=splitAddress(row['Address_'])
    row['town']=extractTown(row['branchName'])
  if 'final' in show_tables():
    execute('DROP TABLE `final`;')

  d_final = []
  for row in d:
    if row['regionName'] not in ["Botswana", "Malawi", "Nambia"]:
      d_final.append(row)

  save([],d_final,'final')
Example No. 50
def find_similar_research():
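    # Compare the reference person's description to the others with tf-idf similarity and save a score for each url.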
    research = select('url, value from maincol where url != ?;', [reference_person])
    research.extend(select('url, value from descriptions where url = ?;', [reference_person]))
    documents = [row['value'].strip() for row in research]
    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    vec = corpus.pop() #The person being compared to
    
    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus])
    sims = index[tfidf[vec]]

    save(['url'], [{"url": row[0], "similarity": row[1][0]} for row in zip([row['url'] for row in research], list(enumerate(sims)))], 'similarity')
def iter_children_areas_kml(parent_id):
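    # Yield one entry per child area of the given MapIt area, including its KML boundary, and record how many children there are.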
    children = getjs('http://mapit.mysociety.org/area/%s/children' % parent_id)
    if 'error' in children:
        raise RuntimeError(children['error'])
    db.save(['parent_area'], {'parent_area': parent_id, 'count': len(children)}, table_name="counts", verbose=0)
    for id, data in children.items():
        kml = requests.get('http://mapit.mysociety.org/area/%s.kml' % id).content
        if POLYGON_ONLY:
            kml = extract_polygon(kml)
        entry = {'parent_area': int(data['parent_area']),
                 'id': int(id),
                 'name': data['name'],
                 'kml': kml}
        yield entry
        time.sleep(SLEEP_TIME)
Example No. 52
def exceeded(runId):
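    # After the CPU-time-exceeded exception is caught, keep saving timestamps once a second to measure how long execution continues.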
    print "---------------------------------------------------------"
    print "Wow, we caught the exception."
    print "Printing the current time so we see how long we have"
    start_time = time()
    while True:
        current_time = time()
        time_after_exception = current_time - start_time
        save(
            [], {
                "time_after_exception": time_after_exception,
                "time": current_time,
                "runId": runId
            })
        long(812323525)**long(624333)
        sleep(1)
Example No. 53
def cp1():
  p=Page('CP1')

  while p.lastpage()==False:
    tables=p.table().subtables()
    d = []
    for table in tables:
        row = table.parse()
        row['business_premises'] = table.business_premises()
        d.append(row)
        print row

    more_cleaning(d,p.pagenum)
    save([],d,'cp1')
    randomsleep()
    p=p.next25()
Example No. 54
def main():
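  # Parse each paragraph into a record: the first line is the entity name, the rest are colon-separated fields.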
  x = fromstring(urlopen(URL).read().replace('<br />','\n').replace('&nbsp;',' '))
  ps = x.xpath('//td[@width="596"]/p')
  d = []
  for p in ps:
    text = p.text_content()
    lines = [line.strip() for line in text.split('\n')]
    row = {"entity":lines.pop(0)}
    for line in lines:
      line = line.replace(' : ', '')
      if line != '':
        key, value = line.split(':')
        row[key] = value
      row['date_scraped'] = DATE
    d.append(row)
  save([], d)
Example No. 55
def parse_entry(trs):
  """Given the full list of trs, extract data from the first database entry
  and remove the first database entry from the trs."""
  d={}
  tr=trs.pop(0)
  while (not is_entry_divider(tr)) and len(trs)>0:
    pairlist=tr.xpath('descendant::*[self::font or self::a]/text()')
    if len(pairlist)!=2:
      _print("Extraction of this key-value pair was less standard.")
      _print(pairlist)
      save(['pair'],{"time":time(),"pair":'|'.join(pairlist)},'nonstandard_pairs')
    key=pairlist[0]
    value=''.join(pairlist[1:])
    d[keyify(key)]=value
    tr=trs.pop(0)
  return d
Example No. 56
def main():
  if None==get_var('DATE'):
    save_var('DATE',time())

  searchTerms=get_searchTerms()
  for searchTerm in searchTerms:
    d=paginate(searchTerm)
    for row in d:
      row['date_scraped']=get_var('DATE')
      row['searchTerm']=searchTerm

    save_var('previous_searchTerm',searchTerm)
    save(['date_scraped', 'Name'],d,'initial')

  save_var('previous_searchTerm',None)
  save_var('DATE',None)
Example No. 57
def separate_addresses():
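    # Split each scraped address into street address, subtown, and optional town, then rebuild the 'final' table.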
    execute('DROP TABLE IF EXISTS final')
    commit()
    d = select('* from `initial`')
    for row in d:
        splitaddress = row['address'].split('\n')
        l = len(splitaddress)
        if l == 3:
            row['street-address'], row['subtown'], row['town2'] = splitaddress
        elif l == 2:
            row['street-address'], row['subtown'] = splitaddress
        else:
            raise AddressError
        row['street-address'] = row['street-address'].strip()
        row['address'] = strip_address(row['address'])
    save([], d, 'final')
  def parse_and_save(self):
    job=self.parse_main()
    lobbyists=self.parse_lobbyists()
    detail=self.parse_detail()

    save(['jobId'],job,'lobbyists',verbose=False)
    save([],lobbyists,'lobbyists_lobbyists',verbose=False)
    save([],detail,'lobbyists_details',verbose=False)
    save(['jobId','sourceUrl'],{"jobId":self.jobId,"sourceUrl":self.url},'lobbyists_urls')
Example No. 59
def cp1():
  execute('''
CREATE TABLE IF NOT EXISTS `businessPremises` (
  `date_scraped` REAL,
  `businessPremisesURL` TEXT,
  FOREIGN KEY (date_scraped, businessPremisesUrl)
  REFERENCES cp1(date_scraped, businessPremisesUrl)
)
''')

  if get_var('crashed') == 1:
    pagenum = select('max(pagenum) from cp1 where date_scraped = (select max(date_scraped) from cp1)')[0]['max(pagenum)']
    print "Resuming from page %d" % pagenum
    p = Page('CP1')
    p = Page('CP1', s=p.s, pagenum=pagenum)
  else:
    print "Starting a new run"
    p = Page('CP1')

  while p.lastpage()==False:
    print "Beginning page %d" % p.pagenum
    tables=p.table().subtables()
    d = []
    for table in tables:
        row = table.parse()
        row['businessPremisesURL'] = table.business_premises_url()

        try:
            business_premises_data, more_registrant_data = table.business_premises(p.s)
        except Exception, msg:
            print "Error on %s: msg" % table.business_premises_url()
            sleep(60)
            print "Trying again"
            business_premises_data, more_registrant_data = table.business_premises(p.s)

        row['date_scraped']=DATE
        row['pagenum']=p.pagenum
        row['url']=URL+"?page=%d"%p.pagenum

        row.update(more_registrant_data)

        save([], business_premises_data, 'businessPremises')
        save(['date_scraped', 'businessPremisesURL'],row,'cp1')

        sleep(1)
    save_var('crashed', 1)
    p=p.next25()
def parse():
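  # Split each row's contact-info text into town, street address, email, mobile, and phone fields and save to 'parsed'.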
  d = select('* from raw')
  for row in d:
    lines = row['contact-info'].split('Email')[0].split('\n')
    row['town'] = re.findall(r'[A-Z]+', lines[0])[0]
    for cell in lines:
      if 'B.P' == cell[0:3]:
        row['street-address'] = cell
      elif 'Email' in cell:
        row['email'] = cell
      elif 'Mob' in cell:
        row['mob'] = cell
      elif 'Phone' in cell:
        row['phone'] = cell
      else:
        print cell
  save(['date-scraped','branch-number'],d,'parsed')