def parse(rowid):
    d = grab(rowid)
    if len(d['rows']) > 0:
        row = d['rows'][0]
        row['id'] = rowid
        save(['id'], row, 'organic_operations')
    return {"done": d['page'] >= d['records']}
def saveFilingTable(url):
    # Scrape the url
    raw = scrape(url)
    # Parse the scrape
    parsed = fromstring(raw)
    # Find the target table
    trgTable = parsed.cssselect("table")[2]
    # Extract all the rows
    rows = trgTable.cssselect("tr")
    # Loop through each row
    for row in rows[1:]:
        cells = row.cssselect("td,th")
        cellcontents = [cell.text_content() for cell in cells]
        data = dict(zip(COLNAMES, cellcontents))
        data['num_workers'] = int(data['num_workers'])
        data['state'] = data['location'].strip()[0:2]
        if data['state'] not in STATES:
            data['state'] = 'unknown'
        v = map(int, data['expiration_date'].split('-'))
        data['expiration_date'] = datetime.date(v[2] + 2000, v[0], v[1])
        save([], data)
def main(testing=False):
    out = ''
    x = fromstring(urlopen('https://views.scraperwiki.com/run/mix_scraper_spreadsheets/?date=' + str(time())).read())
    csv_links = x.xpath('//td[position()=3]/a/@href')
    if testing:
        csv_links = csv_links[0:2]

    # Manual data
    csv_links.append('http://hacks.thomaslevine.com/manual-SA-data-cleaned.csv')

    # Standard Bank data, which was run with Highwall instead of ScraperWiki
    csv_links.append('http://hacks.thomaslevine.com/standardbank-branches-cleaned.csv')
    csv_links.append('http://hacks.thomaslevine.com/standardbank-atm.csv')

    header0, body = getCsv(csv_links[0])
    out += header0[:-2]
    for csv_link in csv_links:
        header, body = getCsv(csv_link)
        if header0 == header:
            out += body[:-2]
        else:
            header_pairs = zip(header0.split(','), header.split(','))
            for pair in header_pairs:
                if pair[0] != pair[1]:
                    print pair
            raise ValueError("Headers from %s and %s don't match." % (csv_links[0], csv_link))
    save(['time'], {"time": time(), "spreadsheet": out}, 'combined_spreadsheets')
def main():
    d = []
    for href in get_office_type_hrefs():
        d.extend(get_office_info(href))
    for row in d:
        row['date_scraped'] = DATE
    save([], d, 'final')
def record_error(self, error_detail, error_at):
    save([], {
        "request": dumps(self.r.request.data),
        "request_content": self.r.content,
        "error_detail": error_detail,
        "error_at": error_at
    }, 'errors')
def search_directory_tree(id, js='getlaw("LAWS","","MAIN")', level=1):
    try:
        sleep(INTERVAL)
        # print 'Searching for %s on level %d' % (js, level)
        save_if_menu(id, js)
        foo, bar, baz = eval(js.replace('getlaw', ''))
        raw = getlaw(foo, bar, baz)
        xml = fromstring(raw)
        links = get_law_links(xml, js)
        if 0 == len(links):
            # If there aren't any links, we've reached the lowest level.
            save_raw_text(id, raw, time())
            save_law_text(id, xml, time())
            save_state(js, level)
        else:
            # If there are links, save them and descend into them in a depth-first fashion.
            # There will only be five levels of recursion, so this is okay for Python
            # even though it doesn't support TRE.
            for link in links:
                link['observation']['parentjs'] = js  # the current page is the parent
                link['observation']['level'] = level
            save(['id'], [link['meta'] for link in links], META)
            save(['id'], [link['observation'] for link in links], OBS)
            save_state(js, level)
            for link in links:
                nextpage = link['observation']['js']
                search_directory_tree(nextid(), nextpage, level + 1)
    except:
        log_error(js=js)
        raise
def save_raw_text(id, raw, date_scraped):
    d = {
        "id": id,
        "rawtext": raw,
        "date_scraped": date_scraped
    }
    save(['id'], d, 'law_text')
def main():
    if get_var('columns_to_do') == None:
        columns = COLUMNS
    else:
        columns = loads(get_var('columns_to_do'))
    while len(columns) > 0:
        column = columns[0]
        d = load_data(column)
        out = []
        for row in d:
            p = Place(row[column], (row['latitude'], row['longitude']))
            row_geocode = p.geocode()
            row_geocode.update({
                "address-column": column,
                "branchId": row['branchId']
            })
            out.append(row_geocode)
            sleep(3)
        save([], out, 'geocoded')
        columns.remove(column)
        if len(columns) == 0:
            save_var('columns_to_do', None)
        else:
            save_var('columns_to_do', dumps(columns))
def geocode(this):
    this.extract_location()
    for location in this.locations:
        try:
            locs_geo = G.geocode(location, exactly_one=False)
        except geocoders.google.GQueryError:
            pass
        except:
            # You didn't see anything
            pass
        else:
            exact = len(locs_geo) == 1
            if not exact:
                indices = range(len(locs_geo))
                indices.reverse()
                for i in indices:
                    #print 'Skipping %s' % locs_geo[i][0]
                    if 'Egypt' not in locs_geo[i][0]:
                        locs_geo.pop(i)
            for loc in locs_geo:
                location_geo, (latitude, longitude) = loc
                save([], {
                    "tweet_id": this._tweet['id'],
                    "place_raw": location,
                    "place_geo": location_geo,
                    "latitude": latitude,
                    "longitude": longitude,
                    "exact": exact
                }, 'geocode')
def parse_pdf_header(page_data):
    sc_data_agg = []
    nums = range(5, 16)
    for num in nums:
        sc_data = page_data.cssselect('text')[num].text_content()
        match = re.search(r':', sc_data)
        if match:
            sc_data = sc_data.split(':', 1)[1].strip()
        sc_data_agg += [sc_data]
    data = dict(zip(variables, sc_data_agg))
    if data['address'] != '':
        us = geocoders.GeocoderDotUS()
        place, (lat, lng) = us.geocode(data['address'])
        print "%s: %.5f, %.5f" % (place, lat, lng)
        data['lat'] = lat
        data['lng'] = lng
    save([], data)
    print data
def scrape_table(url):
    download = urlopen(url)
    raw = download.read()
    html = fromstring(raw)
    table = html.cssselect('table')[2]
    #print [th.text_content() for th in table.cssselect('th')]
    for tr in table.cssselect('tr')[1:]:
        cell_text = [td.text_content() for td in tr.cssselect('td')]
        data = dict(zip(COLUMN_NAMES, cell_text))
        data['num_workers'] = int(data['num_workers'])
        if data['location'][:2] in STATES:
            data['state'] = data['location'][:2]
        data['expiration_date'] = datetime.datetime.strptime(
            data['expiration_date'], '%m-%d-%y').date()
        a_elements = tr.cssselect('a')
        if len(a_elements) > 1:
            raise ValueError('Row has multiple a tags.')
        elif len(a_elements) == 1:
            data['pdf'] = 'http://www.dol.gov/olms/regs/compliance/cba/' + a_elements[0].attrib['href']
        elif len(a_elements) == 0:
            pass
        #print data
        save([], data)
def parse_and_save(root):
    global podcasts_count
    links = root.xpath('//div[@class="ContentTabla"]/ul/li')[1:]
    for link in links:
        url = 'http://www.rtve.es' + link.xpath('span[@class="col_tit"]/a/@href')[0]
        titulo = link.xpath('span[@class="col_tit"]/a/text()')[0].encode('latin-1')
        # Some episodes are missing the download button, but the mp3 still seems to be there (e.g. page 9):
        # http://www.rtve.es/alacarta/audios/carne-cruda/carne-cruda-paralisis-permanente-revive-07-03-12/1342911/
        try:
            url_mp3 = 'http://www.rtve.es' + link.xpath('span[@class="col_tip"]/a/@href')[0]
        except IndexError:
            print 'WARNING: Download not available:', url
            url_mp3 = None
        tipo = "".join(link.xpath('span[@class="col_tip"]/text()')).strip()
        duracion = link.xpath('span[@class="col_dur"]/text()')[0]
        popularidad = link.xpath('span[@class="col_pop"]/span/em/strong/span/text()')[0]
        fecha = link.xpath('span[@class="col_fec"]/text()')[0]
        desc_corta = link.xpath('div//span[@class="detalle"]/text()')[0].encode('latin-1')
        save([], {'titulo': titulo, 'url': url, 'url_mp3': url_mp3, 'tipo': tipo,
                  'duracion': duracion, 'popularidad': popularidad, 'fecha': fecha,
                  'descripcion_corta': desc_corta})
        print '%s: %s' % (fecha, titulo)
        podcasts_count = podcasts_count + 1
def justice_generator(html, current):
    pos = 0
    while pos < len(html):
        data = {}
        pos = get_pattern('''<div.*?/div>''', html, pos, [], data)
        pos = get_pattern('''<img *(src="([^"]*)"| *alt="([^"]*)")+[^>]*/>''', html, pos,
                          [None, 'img_url', 'img_alt'], data)
        pos = get_pattern('''<h4[^>]*>(.*?)</h4>''', html, pos, ['name'], data)
        pos = get_pattern('''<p><strong>(.*?)</strong></p>''', html, pos, ['full_title'], data)
        print(data)
        full_title_match = re.match(
            '(?i)(President of The Supreme Court|Deputy President of The Supreme Court|Justice of The Supreme Court) *, *the right hon *(the)? *((Lord|Lady|Baroness)[-a-zA-Z ]+)(, *(.+))?',
            data['full_title'])
        if full_title_match:
            data['office'] = full_title_match.group(1).strip()
            data['short_title'] = full_title_match.group(3).strip()
            pn = full_title_match.group(6)
            if pn is not None:
                data['postnominal'] = pn.strip()
            else:
                data['postnominal'] = None
        else:
            sqlite.save(unique_keys=['full_title'], data=data, table_name='full_title_anomalies')
            print("Anomaly {0}".format(repr(data)))
            continue
        top_match = back_to_top_pattern.search(html, pos)
        if top_match:
            biography_html = html[pos:top_match.start()]
            pos = top_match.end()
        else:
            biography_html = html[pos:]
            pos = len(html)
        data['biography_html'] = biography_html
        yield(data)
def get_item_tree(root_node):
    caters = root_node.xpath('//a[@class="level1"]/@href')
    caters_name = root_node.xpath('//a[@class="level1"]/text()')
    for (cat, name) in zip(caters, caters_name):
        val = int(re.findall('id_seccion=([0-9]*)', cat)[0])
        save(['id'], {'id': val, 'name': name.encode('latin-1'), 'parent': 0, 'leaf': 0},
             table_name='categorias')
        get_children_from_cat(root_node, val)
def saveFilingTable(url):
    muffin = scrape(url)
    banana = fromstring(muffin)
    tea = banana.cssselect('table')
    you = tea[2]
    marcus = you.cssselect('tr')
    for jay in marcus[1:]:
        tractor = jay.cssselect('td,th')
        aidan = [apple.text_content() for apple in tractor]
        #print COLNAMES
        #print aidan
        #print zip(COLNAMES, aidan)
        data = dict(zip(COLNAMES, aidan))
        data['state'] = data['location'].strip()[0:2]
        data['num_workers'] = int(data['num_workers'])
        assert data['state'] in [
            'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
            'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
            'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
            'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
            'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']
        save([], data)
def saveFilingTable(url):
    muffin = scrape(url)
    banana = fromstring(muffin)
    tea = banana.cssselect('table')
    you = tea[2]
    marcus = you.cssselect('tr')
    for jay in marcus[1:]:
        tractor = jay.cssselect('td,th')
        aidan = [apple.text_content() for apple in tractor]
        data = dict(zip(COLNAMES, aidan))
        print(data)
        data['state'] = data['location'].strip()[0:2]
        data['num_workers'] = int(data['num_workers'])
        if data['state'] not in [
                'Na', 'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL',
                'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME',
                'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH',
                'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI',
                'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']:
            data['state'] = 'unknown'
        v = map(int, data['expiration_date'].split('-'))
        data['expiration_date'] = datetime.date(v[2] + 2000, v[0], v[1])
        #data['expiration_date'] = datetime.datetime.strptime(
        #    data['expiration_date'],
        #    '%m-%d-%y'
        #).date()
        print(data)
        save([], data)
def scrape():
    d = getTopics(BOARD)
    save(['topic-href'], d, 'topics')
    topic_hrefs = [row['topic-href'] for row in d]
    for topic_href in topic_hrefs:
        d = parseTopic(topic_href)
        save([], d, 'links')
def getpdfs():
    html = parse('http://www.safaricom.co.ke/index.php?id=275').getroot()
    html.make_links_absolute()
    pdf_urls = html.xpath('//table[@class="contenttable" and @width="540"]/descendant::a/@href')
    for url in pdf_urls:
        save(['date_scraped', 'url'], {
            "date_scraped": DATE,
            "url": url,
            "pdfxml": pdftoxml(urlopen(url).read())
        }, 'pdfs')
def grab(startitem=1, extracolumns={}, oncompletion=_oncompletion_default):
    # Grab
    _print('Downloading')
    xml = get_search_page(startitem)

    # Parse
    _print('Parsing')
    rows = parse(xml)

    # Add some new information
    search_id = copy(startitem)
    for row in rows:
        # Identifiers so we know which items we've scraped
        row['search_id'] = search_id
        search_id = search_id + 1
        # Any extra information
        row.update(extracolumns)

    # Save to the datastore
    save([], rows, 'directory')

    # Recurse
    if is_last_page(xml):
        oncompletion()
    else:
        _print("Finished items " + ' to '.join(map(str, current_items(xml))) + ' of ' + str(matched_items(xml)))
        _print("Searching for items " + str(startitem) + " to " + str(startitem + 5))
        grab(startitem + 5, extracolumns, oncompletion)
def deep_scrape(urn):
    data = {}

    def merge_in(d):
        "update data with d; complain if anything is overwritten"
        for (k, v) in d.iteritems():
            if k in data:
                assert data[k] == v, "%s: [%s] != [%s]" % (k, data[k], v)
            else:
                data[k] = v

    merge_in(summary_scrape(urn))
    merge_in(page_scrape("general", urn))
    merge_in(page_scrape("communications", urn))
    merge_in(page_scrape("regional-indicators", urn))
    try:
        if "Headteacher" not in data:
            data["Headteacher"] = "".join([
                data["Headteacher Title"],
                data["Headteacher First Name"],
                data["Headteacher Last Name"]
            ])
        if data["Easting"] == "" or data["Northing"] == "":
            raise Exception("No Location Data")
        data = {key: data[key] for key in keys_to_keep}
        sqlite.save(unique_keys=["URN"], data=data)
    except Exception as e:
        print "Error: " + e.message
def process(name, date):
    newdate = date[8:10] + "_" + date[5:7] + "_" + date[0:4]
    url = r"http://www.lloydsbankinggroup.com/media/excel/2010/%s_historic_data.xls" % newdate
    print url
    url = r"http://www.lloydsbankinggroup.com/media/excel/2010/04_06_10_historic_data.xls"
    book = xlrd.open_workbook(file_contents=scrape(url))
    sheet = book.sheet_by_name(name)
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    data = []
    i = 1
    while i < 500:
        try:
            month = sheet.cell_value(i, 0)
            year = sheet.cell_value(i, 1)
            level = sheet.cell_value(i, 2)
        except:
            break
        when = "%04d-%02d-01" % (int(year), months.index(month) + 1)
        i = i + 1
        data.append(level)
        sqlite.save(unique_keys=["Date"], data={"Date": when, "Index": level})
    chart = SimpleLineChart(500, 255, y_range=[0, 700])
    chart.add_data(data)
    metadata.save("chart", chart.get_url())
def main():
    """Check what has been scraped so far, then resume.

    It might be good to check for gaps in the scraping. Or maybe a recursive
    approach isn't the best for search pages like this."""
    # What's already been scraped recently?
    if not 'directory' in show_tables():
        last_searched = 0
    else:
        # Only skip things from the current scraper completion attempt.
        if 'scrape_completions' in show_tables():
            raw_ids = select('scrape_ids from scrape_completions order by completion_id desc limit 1')[0]['scrape_ids']
            max_to_ignore = max(map(int, raw_ids.split(',')))
            min_to_scrape = max_to_ignore + 1
        else:
            min_to_scrape = 1
        incomplete_scrape = select('max("search_id") as m from directory where scrape_id>=' + str(min_to_scrape))[0]['m']
        if incomplete_scrape != None:
            last_searched = incomplete_scrape
        else:
            last_searched = 0

    if 'scrape_times' in show_tables():
        last_id = select('max("scrape_id") as m from scrape_times')[0]['m']
    else:
        last_id = 0

    # Time of scrape start
    scrape_id = last_id + 1
    save(['scrape_id'], {"scrape_id": scrape_id, "scrape_time": time()}, 'scrape_times')
    grab(last_searched + 1, {"scrape_id": scrape_id}, oncompletion=oncompletion)
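# The docstring in main() above suggests checking for gaps in the scraping.
# This is only a minimal sketch of such a check, not part of the original
# scraper: it assumes the `directory` table and its `search_id` column that
# grab() populates, and the ScraperWiki-style select() used elsewhere here.
def find_search_id_gaps():
    "Return the search_ids missing between 1 and the largest search_id scraped so far."
    rows = select('"search_id" from directory')
    seen = set(int(row['search_id']) for row in rows)
    if not seen:
        return []
    return [i for i in range(1, max(seen) + 1) if i not in seen]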
def locationscraper(locationpagesource, idcount):
    address = "not available"
    storeid = "error"
    latlong = "none"
    phone = "none"
    postal = "none"

    city = re.search('BR/>(.+?),', locationpagesource, re.I)
    city = city.group(1)
    city = re.sub("<BR/>", ", ", city)

    storeid = idcount

    latlong = re.search("init\((.+?),'<", locationpagesource)
    latlong = re.sub("'", "", latlong.group(1))

    if re.search('\(\d{3}\) \d{3}-\d{4}', locationpagesource, re.DOTALL | re.S | re.I):
        phone = re.search('\(\d{3}\) \d{3}-\d{4}', locationpagesource, re.DOTALL | re.S | re.I)
        phone = phone.group(0)

    if re.search('\D\d\D \d\D\d', locationpagesource):
        postal = re.search('\D\d\D \d\D\d', locationpagesource)
        postal = postal.group(0)

    row_data = {'Address': city, 'Latlong': latlong, 'Phone': phone,
                'Postal Code': postal, 'Store ID': storeid}
    try:
        save([], row_data)
    except:
        city = "Address unavailable"
        row_data = {'Address': city, 'Latlong': latlong, 'Phone': phone,
                    'Postal Code': postal, 'Store ID': storeid}
        save([], row_data)
def Main():
    page = urllib2.urlopen("http://www.london.gov.uk/who-runs-london/greater-london-authority/expenditure-over-1000")
    soup = BeautifulSoup(page)
    for link in soup.html.body.findAll('a', {'href': re.compile(r'(csv)$')}):
        quotedLink = link['href'].replace(' ', '%20')
        report = urllib2.urlopen(quotedLink).readlines()
        headerLine = findHeaderLine(report)
        reader = csv.DictReader(report[headerLine:])
        for rowNumber, row in enumerate(reader):
            #print row
            amount, currency = fixAmount(
                tryGetColumn(row, 'Amount') or
                tryGetColumn(row, 'Amount £') or
                tryGetColumn(row, 'Amount Paid') or
                tryGetColumn(row, 'Amount\n\xa3') or
                tryGetColumn(row, 'Amount\n\x9c'))
            data = {
                'link': quotedLink,
                'rowNumber': rowNumber,
                'supplier': tryGetColumn(row, 'Supplier') or tryGetColumn(row, 'Vendor') or tryGetColumn(row, 'Vendor Name'),
                'amount': amount,
                'currency': currency,
                'description': tryGetColumn(row, 'Expense Description') or tryGetColumn(row, 'Expenditure Account Code Description'),
                'docType': tryGetColumn(row, 'Doc Type'),
                'docNumber': tryGetColumn(row, 'Doc No') or tryGetColumn(row, 'SAP\nDocument No'),
                'date': fixDate(tryGetColumn(row, 'Date') or tryGetColumn(row, 'Clearing \nDate'))
            }
            if data['supplier'] and data['amount'] and data['description'] and data['amount'] != 'Amount Paid':
                sqlite.save(['link', 'date', 'amount', 'supplier'], data, date=data['date'])
def main():
    # What has already been scraped
    if 'contributions' in show_tables():
        scraped = [row['querystring'] for row in select('querystring from contributions')]
    else:
        scraped = []

    pagenumber = 0
    while True:
        pagenumber = pagenumber + 1
        xml = load(pagenumber)

        # Get the header row
        rows = xml.xpath('//table[@class="table_text"][tr[@class="tan_row"]]')[0].getchildren()[1:]
        keys = ['name', 'contestant_party_district', 'date_received', 'class_and_partnum',
                'association', 'monetary', 'non-monetary']

        # Get the data rows
        ds = []
        for row in rows:
            d = {}  # a fresh dict for each row, so the rows don't share state
            cells = row.getchildren()
            contributor = cells.pop(0).getchildren()[0]
            d['querystring'] = contributor.attrib['href'].replace("javascript:PopUp('contributor.aspx?", '').replace("', '300', '300');", '')
            d[keys[0]] = contributor.text
            for i in range(1, len(cells)):
                d[keys[i]] = cells[i].text
            ds.append(d)

        # Don't run again if already run
        if ds[0]['querystring'] in scraped:
            break
        else:
            save(['querystring'], ds, 'contributions')
def parse(self, text):
    cleaned_text = text.replace('\n', '').replace('\r', '').replace('\t', '')
    html = fromstring(cleaned_text)
    tds = html.xpath('//td[a]')
    branches = [branchinfo(td) for td in tds]
    for row in branches:
        row['cityUrl'] = self.url
        splitchar = '\n' if row['address'].count('\n') > 0 else ','
        splitaddress = row['address'].split(splitchar)
        l = len(splitaddress)
        if l == 3:
            row['street-address'], row['subtown'], row['town2'] = splitaddress
        elif l == 2:
            row['street-address'], row['subtown'] = splitaddress
        elif splitaddress == ['']:
            print 'Empty address'
        else:
            print row['map_Address_']
            print splitaddress
            raise ValueError('Weird address')
        if row.has_key('street-address'):
            row['street-address'] = row['street-address'].strip()
        row['address'] = strip_address(row['address'])
        row['scraperrun'] = scraperrun
    save(['scraperrun', 'cityUrl'], branches, 'branches')
def apples():
    download = urlopen('http://www.dol.gov/olms/regs/compliance/cba/Cba_CaCn.htm')
    rawhtml = download.read()
    html = fromstring(rawhtml)
    tables = html.cssselect('table')
    table = tables[2]
    trs = table.cssselect('tr')
    for tr in trs[1:]:
        tds = tr.cssselect('td')
        cell_values = [td.text_content() for td in tds]
        data = dict(zip(COLUMN_NAMES, cell_values))
        print data

        # state
        if data['location'][:2] in STATES:
            data['state'] = data['location'][:2]

        data['num_workers'] = int(data['num_workers'])
        data['expiration_date'] = datetime.datetime.strptime(data['expiration_date'], '%m-%d-%y').date()

        links = tr.cssselect('a')
        if len(links) == 1:
            data['pdf'] = 'http://www.dol.gov/olms/regs/compliance/cba/' + links[0].attrib['href']
        elif len(links) > 1:
            assert False

        print data
        save([], data)
def locationscraper(locationpagesource, idcount):
    address = "not available"
    storeid = "error"
    lat = "none"
    longitude = "none"
    phone = "None available"
    country = "Not listed"

    city = re.search('class="locality">(.+?)<', locationpagesource, re.DOTALL | re.S)
    city = re.sub(' ', '', city.group(1))
    city = re.sub(',', '', city)

    if re.search('class="street-address"', locationpagesource):
        address = re.search('class="street-address">(.+?)<', locationpagesource, re.DOTALL | re.S)
        address = address.group(1)

    if re.search('class="country-name">(.+?)<', locationpagesource):
        country = re.search('class="country-name">(.+?)<', locationpagesource)
        country = country.group(1)

    storeid = idcount

    lat = re.search('data-store-lat="(.+?)"', locationpagesource)
    lat = lat.group(1)
    longitude = re.search('data-store-lon="(.+?)"', locationpagesource)
    longitude = longitude.group(1)

    if re.search('\d{3}-\d{3}-\d{4}', locationpagesource, re.DOTALL | re.S | re.I):
        phone = re.search('\d{3}-\d{3}-\d{4}', locationpagesource, re.DOTALL | re.S | re.I)
        phone = phone.group(0)

    row_data = {'Address': address, 'City': city, 'Country': country, 'Lat': lat,
                'Long': longitude, 'Phone': phone, 'Store ID': storeid}
    save([], row_data)
def parse_branch(xml, url, region):
    # Get table
    max_trs = max([table.xpath('count(tr)') for table in xml.xpath('//table')])
    table_nodes = xml.xpath('//table[count(tr)=%d]' % max_trs)

    # Check
    l = len(table_nodes)
    if l != 1:
        raise ParseError("I could not identify the appropriate table; %d candidates were found." % l)
    else:
        table = table_nodes[0]

    # Parse
    #from lxml.html import tostring
    #print tostring(table)
    d = parse_branch_table(table)
    d = parse_branch_table_strings(d)
    for row in d:
        row['date_scraped'] = DATE
        row['region'] = region
        row['url'] = url
        row['full-address'] = strip_address(row['full-address'])
        row['street-address'] = strip_address(row['street-address'])
    #print [row.keys() for row in d]
    save([], d, 'branches')
def analyze():
    d = select("""
        `link-href`, GROUP_CONCAT(`author`) AS `authors`, count(*) AS "count"
        FROM `links` JOIN `topics` ON `links`.`topic-href` = `topics`.`topic-href`
        GROUP BY `link-href`
    """)
    execute('DROP TABLE IF EXISTS `wrote-about-same-things`')
    save([], d, 'wrote-about-same-things')
    print '''
These look most exciting because three different people wrote about each.

3  Kiana Fitzgerald,Sara Peralta,Susan Raybuck   http://schedule.sxsw.com/2012/events/event_IAP100409
3  Shawn Dullye,Joe Vasquez,Sara Peralta         http://schedule.sxsw.com/2012/events/event_IAP10593
3  Shawn Dullye,Kiana Fitzgerald,Sara Peralta    http://schedule.sxsw.com/2012/events/event_IAP13848

Of course, that isn't adjusted for how many each person wrote.
'''
    d = select("""
        author, count(*) AS `how-many`
        FROM `links` JOIN topics on links.`topic-href` = topics.`topic-href`
        GROUP BY author
        ORDER BY 2 DESC
    """)
    save(['author'], d, 'how-many-did-you-link')
    print """
def parse(self, text):
    html = fromstring(text)
    citiesParent = html.xpath('//select')
    # This should actually have an option child, but lxml fixes the wrong html
    assert len(citiesParent) == 1
    cities = options(citiesParent[0], valuename="cityId", textname="cityName", ignore_value="0")
    for city in cities:
        city['provinceUrl'] = self.url
        city['cityUrl'] = URLS['cities-base'] + city['cityId']
        city['scraperrun'] = scraperrun
    save(['cityUrl', 'scraperrun'], cities, 'cities')
    return [City(c['cityUrl']) for c in cities]
def download():
    d = []
    for letter in ascii_lowercase:
        x = search_letter(letter)
        branch_tables = x.cssselect('table.locatorTable table')
        d_letter = [extract_branch_info(branch_table) for branch_table in branch_tables]
        for record in d_letter:
            record['url'] = searchurl(letter)
            record['date_scraped'] = DATE
        d.extend(d_letter)
    save([], d)
def scrape_pct(link, pct_name):
    """Scrapes the data associated with the PCT, and calls functions to scrape
    data associated with the services."""
    url = "http://www.nhs.uk" + link
    root = lxml.html.parse(url).getroot()
    d = {}

    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    print lxml.html.tostring(root)
    address = root.cssselect("div.panel-content div.pad p")[0].text
    d["address"] = address
    d["postcode"] = geo.extract_gb_postcode(address)
    try:
        d["lat"], d["lng"] = geo.gb_postcode_to_latlng(d["postcode"])
    except:
        print "Postcode not found", d["postcode"]
    d["info HTML"] = url

    colour = "green"

    # quality
    for t in root.findall("body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v
        if k == "Fair":
            colour = "yellow"
    d["colour"] = colour

    # head honcho
    for t in root.findall("body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"):
        d["Boss"] = t.text.replace("<br />", ", ")

    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class", False) == "intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = (d.get("boilerplate", "") + "\n" + t.text).strip()

    sqlite.save(unique_keys=["PCT", "type", "name"], data=d)
    scrape_facilities(pct_name, root)
    scrape_others(pct_name, url)
def main():
    out = []
    #outfile = open('rwandamicrofinance.json','w')
    kinds = {
        ('7', '123'): 'MFI',
        ('8', '124'): 'Unions',
        ('9', '125'): 'SARL',
        ('10', '126'): 'SA'
    }
    for (k1, k2), v in kinds.iteritems():
        start = 'index.php?option=com_sobi2&catid=%s&Itemid=%s&lang=en' % (k1, k2)
        result, n = eachpage(start, v)
        out.extend(result)
        while n != None:
            result, n = eachpage(n, v)
            out.extend(result)
    for row in out:
        row['date_scraped'] = DATE
    save([], out)
def ScrapeProfile(site, link):
    page = urlopen(link)
    rawtext = page.read()
    html = fromstring(rawtext)
    print tostring(html)

    tables = html.cssselect('table')  # 3 tables on profile, all with valid information

    try:
        imgs = html.cssselect('img')
        data = {'image_link': site + imgs[1].attrib['src']}
    except IndexError:
        print "image not available"
        data = {'image_link': 'None'}

    divinfo = html.cssselect("#inmateAddress")
    address = str(divinfo[0].text_content().strip())
    address = address[5:].strip()
    data['address'] = address

    divinfo = html.cssselect("#holdingLocation")
    location = str(divinfo[0].text_content().strip())
    location = location[17:]
    data['location'] = location

    Table_HEADERS = [['id', 'name', 'book_date'],
                     ['age', 'height', 'weight', 'race', 'sex', 'eye', 'hair'],
                     ['case_num', 'description', 'bond_amount', 'bond_type']]

    for i in range(2):
        tabledata = []
        for tr in tables[i].cssselect('tr'):  # this table contains ID, NAME, BOOKDATE
            cellvalues = [td.text_content().strip() for td in tr.cssselect('td')]
            tabledata.extend(cellvalues)
        data = dict(data.items() + dict(zip(Table_HEADERS[i], tabledata)).items())

    for tr in tables[2].cssselect('tr')[1:]:  # Table 2 contains case number(s), description and cash or Bond type
        cellvalues = [td.text_content().strip() for td in tr.cssselect('td')]
        data1 = dict(zip(Table_HEADERS[2], cellvalues))
        data3 = dict(data.items() + data1.items())
        data3['age'] = int(data3['age'])
        data3['weight'] = int(data3['weight'])
        data3['id'] = int(data3['id'])
        data3['bond_amount'] = int(data3['bond_amount'].strip('$').replace(',', ''))
        data3['book_date'] = datetime.datetime.strptime(data['book_date'], '%m/%d/%Y %I:%M %p').date()
        #print data3
        data3['id_CASENUM'] = str(data3['id']) + '_' + data3['case_num'] + '_' + data3['description'][:6]  # used for unique key
        print data3['id_CASENUM']
        save(['id_CASENUM'], data3)
def parse(url, nodeIds):
    xml = get(url)
    for a in xml.xpath('//div[@class="profile-container"]/div[@class="node-body"]/a'):
        nodeId = a.attrib['href'].split('/')[-1]
        if nodeId in nodeIds:
            # Remove it to speed up future runs
            nodeIds.pop(nodeId)
        else:
            # Add it to the database
            d = {
                "nodeId": nodeId,
                "first_scraped": time()
            }
            save(['nodeId'], d, 'nodeIds')
def branchinfo(href):
    x = fromstring(urlopen(DIR + href).read())
    for thingtype in ("Sub-Branches", "Outlets"):
        locations = [
            loc.strip() for loc in x.xpath(
                '//p[strong/text()="%s"]/following-sibling::p[position()=1]/text()' % thingtype)
        ]
        d = [{
            "location": location,
            "date_scraped": DATE,
            "branch-href": href
        } for location in locations]
        save([], d, "branch_" + thingtype)
def save_page(url, table_name="pages"):
    "Save a url directly to the datastore."
    try:
        handle = urlopen(url)
        text = handle.read()
    except urllib2_URLError as e:
        badurl(url, e)
    except HTTPError as e:
        badurl(url, e)
    except BadStatusLine as e:
        badpage(url, e)
    else:
        d = {"url": url, "text": text}
        save(['url'], d, table_name)
def descend(self, selectIndex=0):
    """Traverse the form fields in a depth-first fashion.

    Sometimes, a form will provide no responses, but this isn't actually a
    problem because the loop just does nothing in that case."""
    select = self.SELECTS[selectIndex]
    options = getattr(self, 'extract_%s' % select['plural'])()
    save([], options, select['plural'])
    for option in options:
        getattr(self, 'submit_%s' % select['singular'])(option['value'])
        if self.is_megabus():
            option['is_megabus'] = True
            save([], option, select['plural'])
        elif selectIndex < len(self.SELECTS) - 1:
            self.descend(selectIndex + 1)
def main():
    blocks = get_blocks()
    blockId = 0
    for block in blocks:
        blockId += 1
        block_info = block.data()
        block_info['blockId'] = blockId
        block_info['date_scraped'] = DATE
        save([], block_info, 'blocks')
        for branch in block.branches():
            branch_info = branch.data()
            branch_info['blockId'] = blockId
            branch_info['date_scraped'] = DATE
            save([], branch_info, 'branches')
def main():
    # Load
    xml = swimport('dsp').dsp('http://www.khula.org.za/Admin/Contacts/RegionalContacts.aspx', False)

    # Parse
    t_nodes = xml.xpath('//table[@width="100%"]')
    assert len(t_nodes) == 1
    table = t_nodes[0]
    d = parse_table(table)

    t = time()
    for row in d:
        row["date_scraped"] = t
    d = moreparsing(d)
    save([], d, 'final')
def parse(url, xml=None, suffix=''):
    if xml == None:
        xml = pull(url)
    sunday = xml.xpath('//h2[@class="date-header"]')[0].text
    twosided = xml.xpath('//div[@class="flipit"]/a[@onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}"]')

    # Get the postcards
    postcards = xml.xpath('//div[@class="flipit"]/a[@onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}"][2]')
    for postcard in xml.xpath('//a[@onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}"]'):
        if not postcard in twosided:
            postcards.append(postcard)

    for a in postcards:
        if not 'bp.blogspot.com' in a.attrib['href']:
            # Not a postcard
            break
        save(["url", "image"], image(a.attrib['href']), "images" + suffix)
        if _isTwosided(a):
            url2 = a.getprevious().attrib['href']
            save(["url", "image"], image(url2), "images" + suffix)
            save(["url1", "url2"], meta(a, sunday, url2=url2), "postcards" + suffix)
        else:
            save(["url1"], meta(a, sunday), "postcards" + suffix)
def main():
    Fall2012 = 'https://www.american.edu/provost/registrar/schedule/schedule-results.cfm?term=2012FN&subj=&search=&mode=title&stat=ALL&hr=&mn=&m=AM&class=Search+Courses'
    Fallpage = urlopen(Fall2012)
    rawtext = Fallpage.read()
    html = fromstring(rawtext)
    print tostring(html)

    COURSE_KEYS = ['CourseNum', "Title", "Prerequisite", "Course Description"]
    SECTION_KEYS = ['Status', 'section', 'credit', 'instructor', 'time']

    maindivs = html.cssselect(".crs-data")
    for crs in maindivs:
        COURSEdata = []
        header = crs.cssselect('.crs-header')[0]
        secs = crs.cssselect('.sec-details')
        #print tostring(header[0]), tostring(secs[0])
        headerdivs = header.cssselect('div')[1:]
        COURSEdata.extend([div.text_content().strip() for div in headerdivs[:2]])
        if len(headerdivs) == 5:
            prereq = headerdivs[3].text_content().strip()
            COURSEdata.append(prereq[14:])
        else:
            COURSEdata.append('')
        descriptionlink = 'https://www.american.edu/provost/registrar/schedule/' + headerdivs[2].cssselect('a')[0].attrib['href']
        descriptionrawtext = fromstring(urlopen(descriptionlink).read())
        try:
            COURSEdata.append(descriptionrawtext.cssselect('.course-header')[0].cssselect('p')[1].text_content().strip())
        except:
            COURSEdata.append('NONE')
        COURSEdata = dict(zip(COURSE_KEYS, COURSEdata))
        #print COURSEdata

        for sec in secs:
            SECdata = []
            sectionDivs = sec.cssselect('div')[1:]
            SECdata.append(sectionDivs[0].text_content().strip())       # status
            SECdata.append(sectionDivs[1].text_content().strip())       # sectionNum
            SECdata.append(sectionDivs[3].text_content().strip())       # credits
            SECdata.append(str(sectionDivs[4].text_content().strip()))  # professor
            SECdata.append(str(sectionDivs[8].text_content().strip()))  # times
            SECdata = dict(zip(SECTION_KEYS, SECdata))
            SECdata = dict(COURSEdata.items() + SECdata.items())
            #SECdata['section'] = int(SECdata['section'])
            #SECdata['credit'] = int(SECdata['credit'])
            SECdata['key'] = SECdata['CourseNum'] + str(SECdata['section'])  # used as unique key
            save(['key'], SECdata)
def main():
    if not 'cities_done' in show_tables():
        cities_done = []
    else:
        cities_done = select('* from cities_done')
    for fromcity in CITIES_NY:
        for tocity in CITIES_NY:
            if fromcity == tocity:
                print 'Skipping within-%s route' % fromcity
            elif {"from": fromcity, "to": tocity} in cities_done:
                print 'Already scraped %s to %s' % (fromcity, tocity)
            else:
                grab(fromcity, "NY", tocity, "NY")
                save([], {"from": fromcity, "to": tocity}, 'cities_done')
def geocode():
    if "address" not in show_tables():
        initialize()
    while select('count(*) AS "c" FROM `address` WHERE `finished` = 0')[0]['c'] > 0:
        address = select("`address-column`, `address-input` FROM `address` WHERE `finished` = 0 LIMIT 1")[0]
        #print address
        if select('count(*) AS "c" FROM `geocode` WHERE `address-input` = ?', [address['address-input']])[0]['c'] == 0:
            d = all_services(address['address-input'])
            for row in d:
                row['address-input'] = address['address-input']
            save([], d, 'geocode')
        params = (address['address-column'], address['address-input'])
        execute("UPDATE `address` SET `finished` = 1 WHERE (`address-column` = ? AND `address-input` = ?)", params)
        commit()
def parse(self, text):
    text = '\n'.join(text.split('\n')[2:4]).replace("document.getElementById('bizdir_directory').innerHTML = '", '')
    text = re.sub(r"';\s*document.getElementById('bizdir_search').disabled = false;", '', text).replace(" 8</div>';", ' 8</div>').replace("\\'", '')
    html = fromstring(text)
    bizdir_directory = []
    for tr in html.cssselect('#bizdir_directory tr'):
        try:
            assert tr.xpath('count(td)') == 1
            name = element(tr, 'td/b').text_content()
            description = element(tr, 'td/p/text()')
            bizdir_directory.append({
                'name': name,
                'description': description,
                'pageOffset': self.offset,
                'scraperrun': scraperrun
            })
        except:
            print tostring(tr)
            raise
    save(['scraperrun', 'pageOffset', 'name'], bizdir_directory, 'organizations')
def join():
    disclosures = select('Entity,upper(Entity) as "ENTITY" from disclosures where entity is not null')
    disclosures_cleaned = [{
        "raw": row['Entity'],
        "clean": remove_ny(row['ENTITY']).strip()
    } for row in disclosures]
    save([], disclosures_cleaned, 'disclosures_cleaned')

    licenses = select('Vendor,upper(Vendor) as "VENDOR" from swdata where Vendor is not null')
    licenses_cleaned = [{
        "raw": row['Vendor'],
        "clean": remove_ny(row['VENDOR']).strip()
    } for row in licenses]
    save([], licenses_cleaned, 'licenses_cleaned')
def moreparsing_map():
    "Map along the most recent results in the table (like a Couch map) and return a new one"
    d = select("* FROM `swdata` WHERE date_scraped=(SELECT max(date_scraped) from `swdata`);")
    for row in d:
        row['street-address'], row['postal-code'] = splitAddress(row['Address_'])
        row['town'] = extractTown(row['branchName'])
    if 'final' in show_tables():
        execute('DROP TABLE `final`;')
    d_final = []
    for row in d:
        if row['regionName'] not in ["Botswana", "Malawi", "Nambia"]:
            d_final.append(row)
    save([], d_final, 'final')
def find_similar_research():
    research = select('url, value from maincol where url != ?;', [reference_person])
    research.extend(select('url, value from descriptions where url = ?;', [reference_person]))
    documents = [row['value'].strip() for row in research]

    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document.lower().split() if word not in stoplist]
             for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    vec = corpus.pop()  # The person being compared to

    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus])
    sims = index[tfidf[vec]]
    save(['url'], [
        {"url": url, "similarity": float(sim)}
        for url, sim in zip([row['url'] for row in research], sims)
    ], 'similarity')
def iter_children_areas_kml(parent_id):
    children = getjs('http://mapit.mysociety.org/area/%s/children' % parent_id)
    if 'error' in children:
        raise RuntimeError(children['error'])
    db.save(['parent_area'], {'parent_area': parent_id, 'count': len(children)},
            table_name="counts", verbose=0)
    for id, data in children.items():
        kml = requests.get('http://mapit.mysociety.org/area/%s.kml' % id).content
        if POLYGON_ONLY:
            kml = extract_polygon(kml)
        entry = {'parent_area': int(data['parent_area']), 'id': int(id), 'name': data['name'], 'kml': kml}
        yield entry
        time.sleep(SLEEP_TIME)
def exceeded(runId):
    print "---------------------------------------------------------"
    print "Wow, we caught the exception."
    print "Printing the current time so we see how long we have"
    start_time = time()
    while True:
        current_time = time()
        time_after_exception = current_time - start_time
        save([], {
            "time_after_exception": time_after_exception,
            "time": current_time,
            "runId": runId
        })
        long(812323525) ** long(624333)
        sleep(1)
def cp1():
    p = Page('CP1')
    while p.lastpage() == False:
        tables = p.table().subtables()
        d = []
        for table in tables:
            row = table.parse()
            row['business_premises'] = table.business_premises()
            d.append(row)
            print row
        more_cleaning(d, p.pagenum)
        save([], d, 'cp1')
        randomsleep()
        p = p.next25()
def main():
    # Normalize the raw HTML: turn <br /> into newlines and collapse
    # non-breaking spaces (assumed to be '&nbsp;' entities in the source).
    x = fromstring(urlopen(URL).read().replace('<br />', '\n').replace('&nbsp;', ' '))
    ps = x.xpath('//td[@width="596"]/p')
    d = []
    for p in ps:
        text = p.text_content()
        lines = [line.strip() for line in text.split('\n')]
        row = {"entity": lines.pop(0)}
        for line in lines:
            line = line.replace(' : ', '')
            if line != '':
                key, value = line.split(':')
                row[key] = value
        row['date_scraped'] = DATE
        d.append(row)
    save([], d)
def parse_entry(trs):
    """Given the full list of trs, extract data from the first database entry
    and remove the first database entry from the trs."""
    d = {}
    tr = trs.pop(0)
    while (not is_entry_divider(tr)) and len(trs) > 0:
        pairlist = tr.xpath('descendant::*[self::font or self::a]/text()')
        if len(pairlist) != 2:
            _print("Extraction of this key-value pair was less standard.")
            _print(pairlist)
            save(['pair'], {"time": time(), "pair": '|'.join(pairlist)}, 'nonstandard_pairs')
        key = pairlist[0]
        value = ''.join(pairlist[1:])
        d[keyify(key)] = value
        tr = trs.pop(0)
    return d
def main():
    if None == get_var('DATE'):
        save_var('DATE', time())
    searchTerms = get_searchTerms()
    for searchTerm in searchTerms:
        d = paginate(searchTerm)
        for row in d:
            row['date_scraped'] = get_var('DATE')
            row['searchTerm'] = searchTerm
        save_var('previous_searchTerm', searchTerm)
        save(['date_scraped', 'Name'], d, 'initial')
    save_var('previous_searchTerm', None)
    save_var('DATE', None)
def separate_addresses():
    execute('DROP TABLE IF EXISTS final')
    commit()
    d = select('* from `initial`')
    for row in d:
        splitaddress = row['address'].split('\n')
        l = len(splitaddress)
        if l == 3:
            row['street-address'], row['subtown'], row['town2'] = splitaddress
        elif l == 2:
            row['street-address'], row['subtown'] = splitaddress
        else:
            raise AddressError
        row['street-address'] = row['street-address'].strip()
        row['address'] = strip_address(row['address'])
    save([], d, 'final')
def parse_and_save(self):
    job = self.parse_main()
    lobbyists = self.parse_lobbyists()
    detail = self.parse_detail()
    save(['jobId'], job, 'lobbyists', verbose=False)
    save([], lobbyists, 'lobbyists_lobbyists', verbose=False)
    save([], detail, 'lobbyists_details', verbose=False)
    save(['jobId', 'sourceUrl'], {"jobId": self.jobId, "sourceUrl": self.url}, 'lobbyists_urls')
def cp1():
    execute('''
        CREATE TABLE IF NOT EXISTS `businessPremises` (
            `date_scraped` REAL,
            `businessPremisesURL` TEXT,
            FOREIGN KEY (date_scraped, businessPremisesUrl)
                REFERENCES cp1(date_scraped, businessPremisesUrl)
        )
    ''')
    if get_var('crashed') == 1:
        pagenum = select('max(pagenum) from cp1 where date_scraped = (select max(date_scraped) from cp1)')[0]['max(pagenum)']
        print "Resuming from page %d" % pagenum
        p = Page('CP1')
        p = Page('CP1', s=p.s, pagenum=pagenum)
    else:
        print "Starting a new run"
        p = Page('CP1')
    while p.lastpage() == False:
        print "Beginning page %d" % p.pagenum
        tables = p.table().subtables()
        d = []
        for table in tables:
            row = table.parse()
            row['businessPremisesURL'] = table.business_premises_url()
            try:
                business_premises_data, more_registrant_data = table.business_premises(p.s)
            except Exception, msg:
                print "Error on %s: %s" % (table.business_premises_url(), msg)
                sleep(60)
                print "Trying again"
                business_premises_data, more_registrant_data = table.business_premises(p.s)
            row['date_scraped'] = DATE
            row['pagenum'] = p.pagenum
            row['url'] = URL + "?page=%d" % p.pagenum
            row.update(more_registrant_data)
            save([], business_premises_data, 'businessPremises')
            save(['date_scraped', 'businessPremisesURL'], row, 'cp1')
            sleep(1)
        save_var('crashed', 1)
        p = p.next25()
def parse():
    d = select('* from raw')
    for row in d:
        lines = row['contact-info'].split('Email')[0].split('\n')
        row['town'] = re.findall(r'[A-Z]+', lines[0])[0]
        for cell in lines:
            if 'B.P' == cell[0:3]:
                row['street-address'] = cell
            elif 'Email' in cell:
                row['email'] = cell
            elif 'Mob' in cell:
                row['mob'] = cell
            elif 'Phone' in cell:
                row['phone'] = cell
            else:
                print cell
    save(['date-scraped', 'branch-number'], d, 'parsed')