def scrape_nyt(self):
    urls = scrapemark.scrape("""
        <body>
        {* <div class='element2'> <h3> <a href='{{ [links].url }}'></a> </h3> </div> *}
        </body>
        """, url='http://www.nytimes.com/most-popular-emailed')['links']
    urls += scrapemark.scrape("""
        <body>
        {* <div class='element2'> <h3> <a href='{{ [links] }}'></a> </h3> </div> *}
        </body>
        """, url='http://www.nytimes.com/most-popular-viewed')['links']
    urls += scrapemark.scrape("""
        <body>
        {* <div class='element2'> <h3> <a href='{{ [links] }}'></a> </h3> </div> *}
        </body>
        """, url='http://www.nytimes.com/most-popular-blogged')['links']
    return urls
def main():
    links = scrape(PATTERN, url=URL)
    print links
    #done = set([res['nzgls_identifier'] for res in sw.sqlite.select('nzgls_identifier FROM bills')])
    #print done
    for link in links['sources']:
        bills = scrape(PATTERN2, url=link)['bills']
        print bills
        for bill in bills:
            print bill
            try:
                bill = scrape(INDIVIDUAL_BILL, url=bill)
            except Exception, e:
                print "DEBUG: %s" % e
                continue
            bill['link'] = link
            do_details(bill)
            do_meta(bill)
            do_related(bill)
            for related_doc in bill['related']:
                related_doc['nzgls_identifier'] = bill['nzgls_identifier']
                related_doc['bill'] = bill['title']
            sw.sqlite.save(['link'], data=bill['related'], table_name='related_docs')
            cleanup(bill)
            sw.sqlite.save(['link', 'valid_from'], data=bill, table_name='bills')
def parse_swift_code_page(url, country_name, queue=Q):
    if url in DONE:
        return None
    print 'downloading', country_name
    raw = get_country_html(url)
    banks = scrape(SWIFT_CODE_PATTERN, html=raw)['banks']
    for bank in banks:
        bank['address'] = cleanup_address(bank['address'])
        bank['country_name'] = country_name
        bank['source'] = url
    sqlite.save(['swift_code'], data=banks, table_name='swift_codes')
    if 'page=' not in url:
        try:
            n_pages = max(int(link.split('=')[-1])
                          for link in scrape(PAGINATION_PATTERN, html=raw))
            pages = [BASE_URL + '/swift-code/search-swift-complete.php?country=%s&page=%d'
                     % (country_name.replace(' ', '%20'), n)
                     for n in xrange(n_pages)]
        except ValueError:  # no more pages
            pages = []
        for newurl in pages:
            queue.push((parse_swift_code_page, newurl, country_name))
    DONE.add(url)
    sqlite.save(['url'], table_name='_done', data=dict(url=url))
def scrape_topsy(self):
    urls = scrapemark.scrape("""
        <body> <div class="list">
        {* <h3 class="title"> <a href='{{ [links].url }}'></a> </h3> *}
        </div> </body>
        """, url='http://topsy.com/top100')['links']
    for page, offset in enumerate([15, 30, 45, 60, 75, 90, 105, 120, 135]):
        urls += scrapemark.scrape("""
            <body> <div class="list">
            {* <h3 class="title"> <a href='{{ [links].url }}'></a> </h3> *}
            </div> </body>
            """, url='http://topsy.com/top100?offset=' + str(offset) + '&om=f&page=' + str(page + 1) + '&thresh=top100')['links']
    return urls
def extract_construction_profs(details):
    blanket = "<h6>Construction Professionals</h6>{{ pros|html }}"
    targetted = """<h6>Construction Professionals</h6>{* <a href="javascript:viewBio('{{ [pros.id] }}');">{{ [pros].name }} </a> *}"""
    pros = scrape(targetted, html=details)['pros']
    if not pros:
        pros = scrape(blanket, html=details)['pros']
    return 'pros', pros
def corporation_registration(url):
    pattern = """Corporation:{{ name }}Name change history Responsible Officer:{{ responsible_officer_name }} Position Title: {{ responsible_officer_name }} Version:{{ registration_id }} Type:{{ registration_type }} Active from:{{ registration_active_from_date }} Activity last confirmed:{{ registration_last_confirmed_date }}
    A. Information about Responsible Officer and Corporation Corporation:{{ corporation_name }} Telephone number:{{ corporation_phone }} Fax number:{{ corporation_fax }} Description of the corporation's business activities: {{ corporation_business_activities }} Parent:{{ parent|html }} Subsidiary:{{ subsidiary|html }} Was the corporation funded in whole or in part by any domestic or foreign government institution in the last completed financial year, or does the client expect funding in the current financial year?{{ is_government_funded }}
    B. Lobbyists Employed by the Corporation List of Senior Officers whose lobbying activities represent less than 20% of their Duties {* Name:{{ [lobbyists].name }} Position title:{{ [lobbyists].title }} Public offices held:{{ [lobbyists].public_offices_held }} Designated public office holder:{{ [lobbyists].is_public_officer }}Name *}{* Name:{{ [lobbyists].name }} Position title:{{ [lobbyists].title }} Public offices held:{{ [lobbyists].public_offices_held }} Designated public office holder:{{ [lobbyists].is_public_officer }} *}
    C. Lobbying Activity Information Federal departments or organizations which have been or will be communicated with during the course of the undertaking: {{ agencies_talked_to }} Communication techniques that have been used or are expected to be used in the course of the undertaking: {{ lobbying_activities }} Information about Subject matter:{{ lobbying_subject_matter }} Details Regarding the Identified Subject Matter """
    subject_matter_pattern = """Details Regarding the Identified Subject Matter {* <tr><td>{{ [topics].category }}</td><td>{{ [topics].description }}</td></tr> *} """
    page = GET(url)
    registration = scrape(pattern, html=html.tostring(html.fromstring(page), encoding='utf-8', method='text'))
    registration['lobbyists'] = [l for l in registration['lobbyists'] if len(l['is_public_officer'].split()) == 1]
    registration['topics'] = scrape(subject_matter_pattern, html=page)
    registration['parent'] = registration['parent'].strip()
    registration['parent_name'] = registration['parent'].split('\n')[0]
    registration['subsidiary'] = registration['subsidiary'].strip()
    registration['subsidiary_name'] = registration['subsidiary'].split('\n')[0]
def main():
    movements = scrape(MOVEMENTS_INDEX, html=get_page(URL))
    print movements
    for m in movements['movements']:
        if 'artcyclopedia.com' in m['link']:
            movement = scrape(MOVEMENTS_INDIVIDUAL, html=get_page(m['link']))
            print m['title']
            if not movement:
                movement = scrape(MOVEMENTS_INDIVIDUAL2, html=get_page(m['link']))
            relations = []
            for relation in movement['related']:
                r = dict(movement=m['title'], related_to=relation['topic'])
                if '/artists/' in relation['link']:
                    r['topic'] = 'artist'
                else:
                    r['topic'] = 'movement'
                relations.append(r)
            artists = []
            for artist in movement['artists']:
                artist['movement'] = m['title']
                dates = artist['alive'].split('-')
                try:
                    artist['birth_year'] = int(dates[0])
                    artist['death_year'] = int(dates[1])
                except ValueError:
                    if 'Born' in artist['alive']:
                        artist['birth_year'] = int(artist['alive'].split()[1])
                        artist['death_year'] = None
                except:
                    print >> sys.stderr, "ERROR: Can't parse dates for %s: %s" % (artist['name'], artist['alive'])
                    artist['birth_year'] = None
                    artist['death_year'] = None
                artist['profile_link'] = URL + artist['profile_link'][3:]
                try:
                    artist['nationality'], artist['profession'] = artist['artist_type'].split(' ', 1)
                except ValueError:
                    artist['nationality'] = artist['artist_type']
                    artist['profession'] = 'unknown'
                artists.append(artist)
            datastore.save(['name'], table_name="movements", data=dict(name=m['title'], link=m['link']))
            datastore.save(['movement', 'related_to'], table_name="relations", data=relations)
            datastore.save(['name', 'nationality'], table_name="artists", data=artists)
def iter_mailing_list_quake_refs():
    base = "http://lists.geonet.org.nz/pipermail/eqnews/"
    index_urls = scrape(PIPERMAIL_INDEX_PATTERN, url=base)
    index_urls.reverse()
    index_urls = index_urls[96:]  ### REMEMBER TO DELETE WHEN FIRST COMPLETE RUN WORKS
    print index_urls
    for month in index_urls:
        print month
        messages = scrape(PIPERMAIL_MONTH_PATTERN, url=base + month)
        messages = [base + month.replace('date.html', link) for link in messages if '0' in link]
        print messages
        for message in messages:
            yield scrape(PIPERMAIL_MESSAGE_PATTERN, url=message)
def get(self):
    # page ALEC_Corporations
    html = urllib2.urlopen("http://www.sourcewatch.org/index.php?title=ALEC_Corporations").read()
    # get for-profit corporation citations
    references = scrape("""<ol class="references"> {* <li> {{ []|html }} </li> *} </ol>""", html)
    self._add_citations(references, 'ALEC_Corporations')
    # get for-profit corporations
    letters = ['A','B','C','D','E','F','G','H','I','J','K','L','M',
               'N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
    corporations = []
    for letter in letters:
        corps = scrape("""<h3> <span class="mw-headline">""" + letter + """ </span> </h3> <ul> {* <li> {{ []|html }} </li> *} </ul> """, html)
        corpdata = []
        for each in corps:
            refs = scrape(""" {* <sup class="reference">[ {{ []|int }} ]</sup> *}""", each)
            datas = scrape(""" {* {{ [] }} <sup class="reference"> </sup> *} """, each)
            name = ''
            if len(datas) > 0:
                name = datas[0]
            info = ''
            if len(datas) > 2:
                for d in datas[0:]:
                    info = info + d
            elif len(datas) > 1:
                info = datas[1]
            corpdata.append((name, info, refs))
        corporations.extend(corpdata)
    self._add_corporations(corporations, 1)

    # page ALEC_Non-Profits
    html2 = urllib2.urlopen("http://www.sourcewatch.org/index.php?title=ALEC_Non-Profits").read()
    # get non-profit corporation citations
    references = scrape("""<ol class="references"> {* <li> {{ []|html }} </li> *} </ol>""", html2)
    self._add_citations(references, 'ALEC_Non-Profits')
    # get non-profit corporations
    nonprofits = []
    for letter in letters:
        np = scrape("""<h3> <span class="mw-headline">""" + letter + """</span> </h3> <ul> {* <li> {{ []|html }} </li> *} </ul> """, html2)
        npdata = []
        for each in np:
            refs = scrape(""" {* <sup class="reference">[ {{ []|int }} ]</sup> *}""", each)
            datas = scrape(""" {* {{ [] }} <sup class="reference"> </sup> *} """, each)
            name = ''
            if len(datas) > 0:
                name = datas[0]
            info = ''
            if len(datas) > 2:
                for d in datas[0:]:
                    info = info + d
            elif len(datas) > 1:
                info = datas[1]
            npdata.append((name, info, refs))
        nonprofits.extend(npdata)
    self._add_corporations(nonprofits, 0)
def StartUp():
    # go to homepage in order to set session cookie
    start_url = "https://delecorp.delaware.gov/tin/GINameSearch.jsp"
    p = ""
    g = {"x": str(time())}
    html = ""
    uastart = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; rv:9.0) Gecko/20100101 Firefox/9.0"}
    try:
        html = scrape(''' <html> {{ [y].html }} </html> ''',
                      url=start_url, get=g, headers=uastart, cookie_jar=myjar)
    except BadStatusLine:
        # hmmm... will skip this check for now..
        return 0
    except Exception, e:
        debug(repr(e))
        debug("scrape problem")
        return 1
def getbdschools(url):
    return scrapemark.scrape("""
        {* <table> <tr></tr> <tr></tr>
        {* <tr>
        <td><a href='{@{*<div class="contactinfonew"></div><div id="heading3"></div> <div class="addressbar contactinfonew"> <div><div>{{[saddress]}}</div> <div>tel:{{[sphone]}}</div> </div> <div> <div><a></a><a href={{[sweb]}} target="new"></a></div> <div><a></a></div> </div> <div><a></a><a></a></div> <div><br>{{[sbrief]}}</div> </div>*} @}'>{{[sname]}}</a></td>
        <td><div>{{[stype]}}</div></td>
        <td><div>{{[sgrade]}}</div></td>
        <td><div>{{[scity]}}</div></td>
        </tr> *}
        </table> *}
        """, url=url)
def get_session_attendants(id): """ Get list of people who have attended a session """ global db url = BASEURL + 'to0045.asp?__ctext=0&__ksinr=' + str(id) print "Lade Anwesenheitsliste", url html = urllib2.urlopen(url).read() data = scrape(""" {* <tr> <td><a href="kp0050.asp?__kpenr={{ [attendee].id|int }}&grnr={{ [attendee].grnr|int }}">{{ [attendee].name }}</a></td> <td>{{ [attendee].organization }}</td> <td>{{ [attendee].function }}</td> </tr> *} """, html) persons = [] attendants = [] for row in data['attendee']: persons.append({ 'person_id': row['id'], 'person_name': row['name'], 'person_organization': row['organization'] }) attendants.append({ 'session_id': id, 'person_id': row['id'], 'attendance_function': row['function'] }) db.save_rows('people', persons, ['person_id']) db.save_rows('attendance', attendants, ['session_id', 'person_id'])
def get_session_attendants(id): """ Scrapet die Liste der (eingeladenen) Teilnehmer einer Sitzung """ global db url = config.BASEURL + (config.URI_ATTENDANTS % id) print "Lade Anwesenheitsliste", url html = urllib2.urlopen(url).read() data = scrape(""" {* <tr> <td><a href="kp0050.asp?__kpenr={{ [attendee].id|int }}&grnr={{ [attendee].grnr|int }}">{{ [attendee].name }}</a></td> <td>{{ [attendee].organization }}</td> <td>{{ [attendee].function }}</td> </tr> *} """, html) persons = [] attendants = [] for row in data['attendee']: persons.append({ 'person_id': row['id'], 'person_name': row['name'], 'person_organization': row['organization'] }) attendants.append({ 'session_id': id, 'person_id': row['id'], 'attendance_function': row['function'] }) if not options.simulate: db.save_rows('people', persons, ['person_id']) db.save_rows('attendance', attendants, ['session_id', 'person_id'])
def GetPage(fileid):
    debug("GetPage:fileid: " + str(fileid))
    # search for a known company:
    params = {
        "JSPName": "GINAMESEARCH",
        "action": "Search",
        "frmFileNumber": fileid,
        "frmEntityName": ""
    }
    html = ""
    try:
        html = scrape(''' <html> {{ [y].html }} </html> ''',
                      url=base_url, post=params, headers=ua, cookie_jar=myjar)
    except Exception, e:
        debug(repr(e))
        debug("scrape problem")
        return 1
def GetPage(fileid):
    try:
        terms = scrape("""
            {* <h2>Full Details</h2> </div> <div class='page_summary_3col'></div> <div class='page_content_3col'><table width='60%'><tr><td colspan='2' class='line'><font size='2'><b>English</b></font></td></tr><tr><td class='line'><font size='2'>Term</font></td><td class='line'><font size='2'>{{ [y].en_term }}</font></td></tr><tr><td class='line'><font size='2'>Definition</font></td><td class='line'><font size='2'>{{ [y].en_definition }}</font></td></tr><tr><td class='line'><font size='2'>Context</font></td><td class='line'><font size='2'>{{ [y].en_context }}</font></td></tr></table><br><table width='60%'><tr><td colspan='2' class='line'><font size='2'><b>Welsh</b></font></td></tr><tr><td class='line'><font size='2'>Term</font></td><td class='line'><font size='2'>{{ [y].cy_term }}</font></td></tr><tr><td class='line'><font size='2'>Definition</font></td><td class='line'><font size='2'>{{ [y].cy_definition }}</font></td></tr><tr><td class='line'><font size='2'>Status</font></td><td class='line'><font size='2'>{{ [y].cy_status }}</font></td></tr><tr><td class='line'><font size='2'>Part of Speech</font></td><td class='line'><font size='2'>{{ [y].cy_part_of_speech }}</font></td></tr><tr><td class='line'><font size='2'>Gender</font></td><td class='line'><font size='2'>{{ [y].cy_gender }}</font></td></tr><tr><td class='line'><font size='2'>Number</font></td><td class='line'><font size='2'>{{ [y].cy_number }}</font></td></tr><tr><td class='line'><font size='2'>Context</font></td><td class='line'><font size='2'>{{ [y].cy_context }}</font></td></tr><tr><td class='line'><font size='2'>Subject : </font></td><td class='line'><font size='2'>{{ [y].cy_subject }}</font></td></tr></table></div></div></div> *}
            """, url=base_url + fileid)
        debug((len(terms['y']), "items found"))
        debug(terms['y'])
        for k in terms['y']:
            k['id'] = fileid
            scraperwiki.sqlite.execute("""
                INSERT OR REPLACE INTO swdata (id, en_term, en_definition, en_context, cy_term, cy_definition, cy_status, cy_part_of_Speech, cy_gender, cy_number, cy_context, cy_subject)
                values (:id, :en_term, :en_definition, :en_context, :cy_term, :cy_definition, :cy_status, :cy_part_of_speech, :cy_gender, :cy_number, :cy_context, :cy_subject)
                """, k, verbose=0)
            scraperwiki.sqlite.commit()
            #scraperwiki.sqlite.save(unique_keys=fileid, data=k, table_name="swdata")
    except Exception, e:
        print e
        return
def process():
    url = 'http://www.tianya.cn/publicforum/content/develop/1/905898.shtml'
    template = Template(u"""
        {* <table id="firstAuthor"> <tr> <td> <a>${author}</a> 发表日期:{{ [stanzas].datetime }} </td> </tr> </table>
        <div id="pContentDiv"> <div class="post"> {{ [stanzas].content }} </div> </div> *}
        {* <table> <tr> <td> <a>${author}</a> 回复日期:{{ [stanzas].datetime }} </td> </tr> </table>
        <div class="post"> {{ [stanzas].content }} </div> *}
        """)
    pattern = template.substitute(author=u'flp713')
    pattern = scrapemark.compile(pattern)
    stanzas = scrapemark.scrape(pattern, url=url, encoding=encoding)['stanzas']
    return stanzas
def pagefetch(p_url, debug=False):
    html = urllib2.urlopen(p_url).read()
    results = scrapemark.scrape("""{* <div id="srp"> <ul id="results">
        {* <li> <a><img alt="" src={{[thumbs]}}/> </a>
        <div class="result-info"> <h3><a href="speaker.php?{{[links]}}">{{[names]}}</a></h3> </div>
        </li> *}</ul>
        <p class="pagination"> <a href="results.php?{{[nxurl]}}">Next</a></p>
        </div> *}""", html)
    if debug:
        print "Fetched Names:", len(results['names'])
        print "Fetched Relinks:", len(results['links'])
        print "Current Page:", p_url
        print "Next Page:", results['nxurl']
    return results
def post(self):
    logging.debug('ItemHandler.post')
    url = self.request.get('url')
    detail = scrapemark.scrape("""
        {* <tr><td><font>{{ name }}</font></td></tr> *}
        {* <tr><th>Specialty</th><td>{{ specialty }}</td></tr> *}
        {* <tr><th>Facility</th><td>{{ facility }}</td></tr> *}
        {* <tr><th>Address</th><td>{{ address|html }}</td></tr> *}
        {* <tr><th>Phone</th><td>{{ phone }}</td></tr> *}
        {* <tr><th>Certification</th><td>{{ certification }}</td></tr> *}
        {* <tr><th>Medical School</th><td>{{ school }}</td></tr> *}
        {* <tr><th>Residency</th><td>{{ residence }}</td></tr> *}
        {* <tr><th>Gender</th><td>{{ gender }}</td></tr> *}
        """, url=url)
    address = detail['address'].replace('<br>', '\n').replace('\t', '').replace('\r', '').replace('\n\n', '\n')
    office = models.Office.getOrCreate(detail['facility'], address, detail['phone'])
    detail['specialties'] = [i.strip() for i in detail['specialty'].split(';')]
    doc = models.Doc(**detail)
    doc.office = office
    doc.put()
def getIO(name, urlz):
    ios = scrapemark.scrape("""
        {* <td><font>{{ [io].direction }}</font></td>
        <td><font>{{ [io].ft }}</font></td>
        <td><font>{{ [io].substance }}</font></td>
        <td>{{ [io].value }}</font></td>
        <td>{{ [io].min }}</td>
        <td>{{ [io].max }}</td>
        <td>{{ [io].std }}</td>
        <td><font>{{ [io].unit }}</font></td>
        <td><font>{{ [io].environment }}</font></td>
        <td><font>{{ [io].geo }}</font></td>
        </tr> *}
        """, url=urlz)
    inventorystr = ""
    for flow in ios['io']:
        if flow['direction'] == "Input" or flow['direction'] == "Output":
            inventorystr = inventorystr + "<eco:hasUnallocatedExchange>"
            inventorystr = inventorystr + '<eco:hasEffect><rdfs:type rdf:resource="eco:' + flow['direction'] + '" /><eco:hasTransferable><eco:Substance><rdfs:label>' + flow['substance'] + '</rdfs:label></eco:Substance></eco:hasTransferable></eco:hasEffect>'
            inventorystr = inventorystr + "<eco:hasQuantity><eco:hasUnitOfMeasure>" + flow["unit"] + "</eco:hasUnitOfMeasure><eco:hasMagnitude>" + flow["value"] + "</eco:hasMagnitude><ecoUD:maxValue>" + flow["max"] + "</ecoUD:maxValue><ecoUD:minValue>" + flow["min"] + "</ecoUD:minValue><ecoUD:maxValue>" + flow["max"] + "</ecoUD:maxValue><ecoUD:ecoUD:standardDeviation95>" + flow["std"] + "</ecoUD:ecoUD:standardDeviation95></eco:hasQuantity>"
            inventorystr = inventorystr + '</eco:hasUnallocatedExchange>'
    return inventorystr
def parse_section(section):
    section_data = None
    for section_pattern in section_patterns:
        test_section_data = scrape(section_pattern, section)
        if test_section_data is not None:
            section_data = test_section_data
    if section_data is None:
        #print section
        return {}  #return section
    recheck = False
    try:
        section_data['start']
    except KeyError:
        pass
    else:
        if ' to ' in section_data['start']:
            section_data['start'], section_data['end'] = section_data['start'].split(' to ')
        #TODO section_patterns: Fix the patterns above to avoid doing this hack
        if 'end' in section_data and \
           section_data['start'].lower().endswith('san') and \
           section_data['end'].lower().startswith('lan to '):
            section_data['start'] = 'Santolan'
            section_data['end'] = section_data['end'].lower().replace('lan to ', '')
    if isinstance(section_data['stat'], list):
        section_data['stat'] = '-'.join(section_data['stat'])
    is_saved = False
    if 'stat' not in section_data:
        #print section
        return {}  #return section
    return section_data
def fetchresultpage(sessionid, pagenumber, county):
    try:
        result = scrapemark.scrape("""
            <tr nowrap="" align="left" valign="top"></tr>
            {* <tr align='left'>
            {* <td align='center'></td>
            <td align='center'></td>
            <td> <a href="javascript: OpenDetail('{{ [offenders].uniqueid }}')"> {{ [offenders].name }} </a> </td>
            {# <td> {{ [offenders].address }} </td>
            <td>{{ [offenders].city }}</td>
            <td align='center'>{{ [offenders].zip }}</td>
            <td>{{ [offenders].county }}</td> #}
            *} </tr> *}
            """, url='http://www.meganslaw.ca.gov/cgi/prosoma.dll?w6=' + sessionid + '&searchby=CountyList&SelectCounty=' + county + '&SB=0&PageNo=' + str(pagenumber))
    except:
        return "Error"
    return result
def harvest():
    squeezed = memcache.get('Squeezed::lemons')
    if squeezed is None:
        squeezed = Squeezed.get_by_key_name('squeezed')
        if squeezed is None:
            fresh = (baseurl + lemon['url'] for lemon in lemons())
        else:
            fresh = (baseurl + lemon['url'] for lemon in lemons() if lemon['url'] not in squeezed.lemons)
    else:
        fresh = (baseurl + lemon['url'] for lemon in lemons() if lemon['url'] not in squeezed.lemons)
    bucket = []
    for lemon in fresh:
        logging.info('squeezing ' + lemon)
        juices = scrapemark.scrape("""
            <span class='tpc_title'></span>
            {* <img src='{{ [juices].image }}' border=0>
            <a href='{{ [juices].download }}' target=_blank></a> *}
            """, url=lemon)['juices']
        logging.info(juices)
        for juice in juices:
            try:
                juice = Juice(key_name=lemon, image=juice['image'], download=juice['download'])
                juice.put()
            except BadValueError:
                logging.info(juice)
        bucket.append(lemon)
    if squeezed is None:
        squeezed = Squeezed(key_name='squeezed', lemons=bucket)
    else:
        squeezed.lemons.extend(bucket)
    squeezed.put()
    memcache.set('Squeezed::lemons', squeezed)
def process(thread):
    for url in pages(thread):
        stanza_template = Template(u"""
            {* <table id="firstAuthor"> <tr> <td> <a>${author}</a> 发表日期:{{ [stanzas].datetime }} </td> </tr> </table>
            <div id="pContentDiv"> <div class="post"> {{ [stanzas].content|html }} </div> </div> *}
            {* <table> <tr> <td> <a>${author}</a> 回复日期:{{ [stanzas].datetime }} </td> </tr> </table>
            <div class="post"> {{ [stanzas].content|html }} </div> *}
            """)
        logging.info(thread['author'])
        pattern = scrapemark.compile(stanza_template.substitute(author=thread['author']))
        logging.info(pattern)
        thread['stanzas'][url] = scrapemark.scrape(pattern, url=url, encoding=encoding)['stanzas']
        logging.info(thread['stanzas'][url])
def getIO(name, urlz):
    #print url
    #html = scraperwiki.scrape(url)
    #soup = BeautifulSoup(html)
    ios = scrapemark.scrape("""
        {* <td><font>{{ [io].direction }}</font></td>
        <td><font>{{ [io].ft }}</font></td>
        <td><font>{{ [io].substance }}</font></td>
        <td>{{ [io].value }}</font></td>
        <td>{{ [io].min }}</td>
        <td>{{ [io].max }}</td>
        <td>{{ [io].std }}</td>
        <td><font>{{ [io].unit }}</font></td>
        <td><font>{{ [io].environment }}</font></td>
        <td><font>{{ [io].geo }}</font></td>
        </tr> *}
        """, url=urlz)
    for flow in ios['io']:
        if flow['direction'] == "Input" or flow['direction'] == "Output":
            scraperwiki.sqlite.execute("insert into SPINEIO values (?,?,?,?,?,?,?,?,?,?,?)",
                (name, flow['direction'], flow['ft'], flow['substance'], flow['value'], flow['min'],
                 flow['max'], flow['std'], flow['unit'], flow['environment'], flow['geo']))
            scraperwiki.sqlite.commit()
def main():
    # Fetch last page index
    last_page = scraperwiki.sqlite.get_var('last_page', default=0)
    # Scrape initial list
    p = scrape(PAGE_LIST_PATTERN, url=LIST_URL)
    # print p
    print 'starting from ' + str(last_page)
    # if last_page == 0:
    print 'first page? '
    # Scrape the first list page
    scrape_list(LIST_URL)
    # slice from last index
    p = p[last_page:]
    # print p
    # Scrape each list page
    for page in p:
        # print 'scraping page : ' + str(page)
        url = "%s&intPageNumber=%d" % (LIST_URL, page)
        # print url
        scrape_list(url)
        # save page index
        scraperwiki.sqlite.save_var('last_page', page - 1)
    # reset page index to 0
    scraperwiki.sqlite.save_var('last_page', 0)
def parse_entry(entry):
    updated_at = entry.updated_at
    # Add 8 hours to consider Asia/Manila timezone
    #updated_at = updated_at + datetime.timedelta(0, 8 * 60 * 60)
    now = datetime.datetime.now()
    if updated_at.day > now.day:
        updated_at = updated_at - datetime.timedelta(1)
    text = entry.text
    text = re.sub('%s[, ]?' % entry.road.name, '', text, flags=re.IGNORECASE)
    text = re.sub('http://twitpic.com/[A-Za-z0-9] ?', '', text, flags=re.IGNORECASE)
    data = None
    # Figure out if the data would make sense.
    for main_pattern in main_patterns:
        test_data = scrape(main_pattern, text)
        if test_data is not None:
            data = test_data
            break
    if data is None:
        return
    # Get the time
    #print entry.road, updated_at.strftime('%d-%H:%M'),
    stat_time = data.get('time', None)
    if stat_time:
        if 'pm' in stat_time.lower():
            add12 = True
        else:
            add12 = False
        try:
            stat_time = datetime.datetime.strptime(stat_time.replace(' ', ''), '%H:%M%p')
        except KeyError, e:
            stat_time = updated_at
        except ValueError, e:
            #print stat_time.replace(' ', ''), e
            stat_time = updated_at
def parse_list(resp):
    html = BeautifulSoup(resp.body).prettify()
    members = scrape("""{* <tr>
        <td> <a href='{{ [res].idlink }}'>{{ [res].name }}</a> {* <strong>({{ [res].ref }})</strong> *} </td>
        <td> <font>partido {{ [res].party }}</font> </td>
        </tr> *}""", html=html)['res']
    # TODO: The president of the chamber may appear only in a footer. Add him
    # to the members list.
    sel = HtmlXPathSelector(resp)
    trs = sel.select('//tr/td[@align="RIGHT" and @valign="TOP" and @width="5%"]/font/strong/../../..')
    refs = {}
    for tr in trs:
        ref = tr.select('.//strong[starts-with(text(), "(")]/text()')[0].extract()[1:-1]
        sub_info = "".join(tr.select('.//td[2]/font/descendant-or-self::*/text()').extract())
        refs[ref] = sub_info
    items = []
    for info in members:
        #since = None
        #to = None
        why = None
        #substitutes = None
        if 'ref' in info and info['ref'] is not None:
            why = refs[info['ref']]
            #substitutes = sub_info['name']
            #range = get_substitution_range(sub_info['why'])
            #why = get_substitution_reason(sub_info['why'])
            #if len(range) > 0:
            #    since = range[0]
            #if len(range) > 1:
            #    to = range[1]
        date = resp.meta['date']
        id = extract_id_link(info['idlink']) + date.strftime(DATE_FMT)
        items.append(SubstitutesItem(id=id,
                                     date=date,
                                     name=info['name'],
                                     party=info['party'],
                                     chamber=resp.url[-1],
                                     #substitutes=substitutes,
                                     #substitutes_from=since,
                                     #substitutes_to=to,
                                     substitutes_line=why))
    return items
def scrapeEpisodes(url):
    return scrapemark.scrape("""
        {* <td class="summary">"<b>{{ [episode].name }}</b>"</td>
        <span class="bday dtstart published updated">{{ [episode].date }}</span> *}
        """, url=url)
def get_values_for_station_and_day(station, date):
    datestring = date.strftime('%d.%m.%Y')
    now = datetime.today()
    url = 'http://luadb.lds.nrw.de/LUA/wiski/pegel.php?stationsname_n=' + station + '&meindatum=' + datestring + '&tabellet=Tabelle'
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.open(url)
    assert br.viewing_html()
    data = scrapemark.scrape("""
        {* <td class='messwerte'>{{ [values].datetime }}</td>
        <td class='messwerte'>{{ [values].value|float }} </td> *}
        """, br.response().read())
    if 'values' in data:
        datasets = []
        #print data['values']
        for row in data['values']:
            #print station, row['datetime'], ("%.2f" % row['value'])
            # datetime string can be "DD.MM HH:MM" or "HH:MM"
            match1 = re.match(r"([0-9]{2})\.([0-9]{2})\s+([0-9]{2}):([0-9]{2})", row['datetime'])
            match2 = re.match(r"([0-9]{2}):([0-9]{2})", row['datetime'])
            year = None
            if match1 is not None:
                day = match1.group(1)
                month = match1.group(2)
                year = now.year
                hour = match1.group(3)
                minute = match1.group(4)
                if now.day == 1 and now.month == 1 and day == 31 and month == 12:
                    year = year - 1
            elif match2 is not None:
                day = date.day
                month = date.month
                year = date.year
                hour = match2.group(1)
                minute = match2.group(2)
            if year is not None:
                mez_timestamp = int(datetime(int(year), int(month), int(day), int(hour), int(minute)).strftime('%s'))
                utc_timestamp = mez_timestamp - 3600
                utcdate = datetime.fromtimestamp(utc_timestamp)
                datasets.append({
                    'station': station,
                    'datetime_utc': utcdate.strftime('%Y-%m-%d %H:%S'),
                    'value': ("%.2f" % row['value'])
                })
        scraperwiki.sqlite.save(unique_keys=['datetime_utc', 'station'], data=datasets, table_name="raindata")
        return len(datasets)
def iter_recent_quakes():
    for quake in scrape(pattern=RECENT_QUAKES_PATTERN, url=URL)['quakes']:
        quake['url'] = 'http://www.geonet.org.nz' + quake['url']
        quake['shaking_map_url'] = 'http://www.geonet.org.nz' + quake['shaking_map_url']
        quake['maps_url'] = 'http://www.geonet.org.nz' + quake['maps_url']
        quake['img_of_quake_location_url'] = 'http://www.geonet.org.nz' + quake['img_of_quake_location_url']
        for k, val in fetch_quake_data(quake['geonet_ref']).iteritems():
            quake[k] = val
        yield quake
def scrape_list(url):
    #html = mech_scrape(url)
    p = scrape(EST_PATTERN, url=url)
    print p
    for e in p:
        est_url = "%s%s%d" % (BASE_URL, DETAIL_URL, e)
        print 'scraping: ' + est_url
        print 'scraping id: ' + str(e)
        scrape_detail(est_url, e)
def fetch_load_url(self):
    pattern = ''' <title>{{ pagetitle }}</title> '''
    result = scrapemark.scrape(pattern, url=self.url)
    self.html_title = result['pagetitle']
    self.fetched_url = True
def parse_list(resp):
    html = BeautifulSoup(resp.body).prettify()
    members = scrape("""{* <tr>
        <td> <a href='{{ [res].idlink }}'>{{ [res].name }}</a> {* <strong>({{ [res].ref }})</strong> *} </td>
        <td> <font>partido {{ [res].party }}</font> </td>
        </tr> *}""", html=html)['res']
    # TODO: The president of the chamber may appear only in a footer. Add him
    # to the members list.
    sel = HtmlXPathSelector(resp)
    trs = sel.select('//tr/td[@align="RIGHT" and @valign="TOP" and @width="5%"]/font/strong/../../..')
    refs = {}
    for tr in trs:
        ref = tr.select('.//strong[starts-with(text(), "(")]/text()')[0].extract()[1:-1]
        refs[ref] = tr
    items = []
    for info in members:
        since = None
        to = None
        line = None
        substitutes_name = None
        substitutes_oid = None
        if 'ref' in info and info['ref'] is not None:
            try:
                tr = refs[info['ref']]
            except KeyError:
                logger.warning('Couldnt find reference %s in substitutes table.' % info['ref'],
                               exc_info=sys.exc_info())
            line = "".join(tr.select('.//td[2]/font/descendant-or-self::*/text()').extract())
            links = tr.select('.//a')
            if links:
                substitutes_oid = extract_id_link(links[0].select('.//@href').extract()[0])[2:]
                substitutes_name = links[0].select('.//text()').extract()[0]
            range = get_substitution_range(line)
            if len(range) > 0:
                try:
                    since = datetime.strptime(range[0], PAGE_DATE_FMT).date()
                except ValueError, e:
                    logger.warning("Unable to parse substitute 'since' date", exc_info=sys.exc_info())
            if len(range) > 1:
                try:
                    to = datetime.strptime(range[1], PAGE_DATE_FMT).date()
                except ValueError, e:
                    logger.warning("Unable to parse substitute 'to' date", exc_info=sys.exc_info())
def scrape_detail(est_url, id):
    html = scraperwiki.scrape(est_url)
    est_details = scrape(DETAIL_PATTERN, html)
    if not est_details:
        # Try the exempt pattern
        est_details = scrape(EXEMPT_PATTERN, html)
        if not est_details:
            # it's either changed hands and will turn up soon, or it's new
            return
    else:
        # print est_details['inspection_date']
        est_details['inspection_date'] = datetime.strftime(
            datetime.strptime(est_details['inspection_date'], '%d/%m/%Y'), '%Y-%m-%d')
        # parser.parse(est_details['inspection_date'])
        # print est_details['inspection_date']
    # Locate: attempt to find an existing geocode first
    sql = 'lat, lng FROM swdata WHERE address = "%s" AND lat IS NOT NULL LIMIT 0,1' % est_details['address']
    latlng = scraperwiki.sqlite.select(sql)
    # Avoid multiple google lookups
    if latlng:
        # print 'DB Geo'
        # print latlng
        est_details['lat'] = latlng[0]['lat']
        est_details['lng'] = latlng[0]['lng']
        # print est_details['lat']
    else:
        # print 'Goog lookup'
        location = locate(est_details['address'] + ', Auckland, NZ')
        if location:
            est_details['lat'], est_details['lng'] = location
    #est_details['fg_id'] = id  # Gah! id aint unique??
    #est_details['url'] = est_url  # URLs are useless - the IDs float!!?? WTF!?
    # Save
    scraperwiki.sqlite.save(unique_keys=['name', 'address', 'grade', 'inspection_date'], data=est_details)
    print 'saved'
def getEachRecord(name, urlz):
    #print url
    #html = scraperwiki.scrape(url)
    #soup = BeautifulSoup(html)
    #date = soup.find(text="Date Completed").parent.parent.parent.nextSibling.nextSibling.text
    #print date
    inventory = {}
    temp = scrapemark.scrape("""
        {* <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;" face="Verdana"><em>Date Completed</em></font></th>
        <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ date }}</font> *}
        """, url=urlz)
    inventory['date'] = temp['date']
    temp = scrapemark.scrape("""
        {* <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;" face="Verdana"><em>Copyright</em></font></th>
        <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ copyright }}</font> *}
        """, url=urlz)
    inventory['copyright'] = temp['copyright']
    temp = scrapemark.scrape("""
        {* <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;"><em>Process Type</em></font></th>
        <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ desc }}</font> *}
        """, url=urlz)
    inventory['description'] = temp['desc']
    temp = scrapemark.scrape("""
        {* <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;"><em>Function</em></font></th>
        <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ desc }}</font> *}
        """, url=urlz)
    inventory['description'] = inventory['description'] + ". " + temp['desc']
    scraperwiki.sqlite.execute("insert into SPINE values (?,?,?,?)",
                               (name, inventory['date'], inventory['description'], inventory['copyright']))
    scraperwiki.sqlite.commit()
def get_csv_link():
    """ Return up-to-date csv link """
    csv_pattern = """{*<a href="{{ href }}" title="Click here to download this file." target="blank"> ECE Directory .csv*}</a>"""
    det = scrape(csv_pattern,
                 url='http://www.educationcounts.govt.nz/directories/early-childhood-services')
    return 'http://www.educationcounts.govt.nz%s' % det['href']
def swift_codes(queue=Q):
    print 'Getting countries'
    raw = GET(SWIFT_URL)
    print raw
    countries = scrape(COUNTRY_PATTERN, html=raw, headers=HEADERS)['countries']
    print countries
    for country in countries:
        print country
        country['link'] = BASE_URL + country['link']
        queue.push((parse_swift_code_page, country['link'], country['name']))
def get_values():
    """Get actual readings from the stations"""
    baseurl = 'http://www.hlug.de/static/pegel/static/'
    listpageurl = baseurl + "list_N_0.htm?entryparakey=N"
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.open(listpageurl)
    assert br.viewing_html()
    links = []
    for link in br.links(url_regex=".*stat_[0-9]+.htm\?entryparakey=N"):
        links.append(link.url)
    links = shuffle(links)
    for link in links:
        subpageurl = baseurl + link
        print "Fetching", subpageurl
        br.open(subpageurl)
        html = br.response().read()
        station = scrapemark.scrape("""
            <table class="wwp_sdheader" cellpadding=4 width=720>
            <tr> <td class="wwp_sdheader" colspan=6>Station</td> </tr>
            <tr>
            <td class="head">Name</td><td class="td1">{{ name }}</td>
            <td class="head">Messstellen-Nr.</td><td class="td1">{{ id|int }}</td>
            <td class="head">Flussgebiet</td><td class="td1">{{ river }}</td>
            </tr>
            </table>
            <a target="_blank" class="graphlink" href="data_{{ linkid }}_N_WEEK.xls">4-Tage</a>
            """, html)
        #print station
        if station is not None and 'linkid' in station:
            excelurl = baseurl + 'data_' + station['linkid'] + '_N_WEEK.xls'
            print excelurl
            book = xlrd.open_workbook(file_contents=urllib.urlopen(excelurl).read())
            if book:
                sheet = book.sheets()[0]
                if sheet.ncols == 2 and sheet.nrows > 0:
                    values = []
                    for rownumber in range(3, sheet.nrows):  # skip first 3 rows
                        (datecell, numcell) = [sheet.cell(rownumber, j) for j in range(sheet.ncols)]
                        #print "%s, %.1f" % (datecell.value, numcell.value)
                        match = re.match(r"([0-9]{2})\.([0-9]{2})\.([0-9]{4})\s([0-9]{2}:[0-9]{2})", datecell.value)
                        if match is not None:
                            values.append({
                                'datetime': match.group(3) + '-' + match.group(2) + '-' + match.group(1) + ' ' + match.group(4),
                                'station_id': station['id'],
                                'rain_mm': ("%.1f" % numcell.value),
                            })
                    #print values
                    scraperwiki.sqlite.save(unique_keys=['datetime', 'station_id'], data=values, table_name="raindata")
                else:
                    print "WARN: No workable data found."
def post(self):
    logging.debug('SyncHandler.post')
    result = scrapemark.scrape("""
        {* <tr class='metalist'><td><a href='{{[details]}}'></a></td></tr> *}
        """, url=LIST_URL)
    for detailurl in result['details']:
        taskqueue.add(url='/tasks/item', params={'url': detailurl})
    self.redirect('/')
def GetPage(fileid):
    try:
        fin = urllib2.urlopen(base_url + fileid)
        text = fin.read()
        fin.close()
        pprint(text)
        # test for no match
        no_match = scrape("""
            <hr>There are {{ }} that match your search criteria.<br>
            """, html=text)
        print no_match
        #TODO: Save no match
        #if no_match == "no entries":
        # basic details:
        basic_details = scrape("""
            <span class=detailstext>Registration Number: {{ [y].reg_no }}</span><P><span class=detailstext>Date Registered: </span>{{ [y].reg_date }} <span class=detailstext>Registration Expires: </span>{{ [y].reg_expiry }}<br><br><span class=detailstext>Data Controller: </span>{{ [y].data_controller }}<P><div class=detailstext>Address:</div><Blockquote>{{ [y].reg_address|html }}</BlockQuote><hr>
            """, html=text)
        print basic_details
        debug((len(basic_details['y']), "items found"))
        debug(basic_details['y'])
        # foi:
        foi = scrape("""
            <P ALIGN=center class=detailstext>{{ }} or a Scottish public authority
            """, html=text)
        print foi
        #if foi == "Freedom of Information Act 2000":
        #<P class=detailstext>Other Names:</P><BlockQuote>FIRST MONEY DIRECT<br>FIRSTMONEYDIRECT.CO.UK<br></BlockQuote></BlockQuote><hr>
    except Exception, e:
        print e
        return
def parse_search_results(url, first=False):
    pattern = """{* <td>{{ [lobbyists].type }}:<strong>{{ [lobbyists].name }}</strong>
        {{ [lobbyists].lobbyist_details|html }}<a href="{{ [lobbyists].communication_reports_link|abs }}">View communication reports</a>
        </td>
        <td class="tableTop">
        <a href="{{ [lobbyists].registration_link|abs }}">
        {{ [lobbyists].registration_begining }}to{{ [lobbyists].registration_ending }}
        </a>
        </td> *}
        {* <a href="{{ next|abs }}">Next</a> *}
        """
    if first:
        res = scrape(pattern=pattern, url=url, post=params, cookie_jar=CJ)
    else:
        res = scrape(pattern=pattern, url=url, cookie_jar=CJ)
    print res
    lobbyists = res['lobbyists']
    next_page_url = res['next']
    print next_page_url
    for lobbyist in lobbyists:
        details = html.fromstring(lobbyist['lobbyist_details'])
        if lobbyist['type'] == u'Consultant':
            lobbyist['consulting_firm'] = details[1].text
            lobbyist['client'] = details[3].text
            lobbyist['lobbyist_id'] = details[4].tail.strip()
        elif lobbyist['type'] == u'In-house Organization' or lobbyist['type'] == u'In-house Corporation':
            lobbyist['responsible_officer'] = ' '.join(part.strip() for part in details[1].text.split())
            lobbyist['lobbyist_id'] = details[2].tail.strip()
        else:
            print 'CRAZINESS: new type found: ', lobbyist['type'],
            print lobbyist
            raise ValueError
        del lobbyist['lobbyist_details']
        Q.put((comms_report_index, lobbyist['communication_reports_link']))
        Q.put((registration, lobbyist['registration_link']))
def getDefinition(self, html):
    definition = scrape("""
        <table border=0 cellspacing=10 cellpadding=0 width=100%>
        <tr> <td> {{ }} </td> </tr>
        </table>
        """, html)
    return definition
def fetchsession():
    global fetchnumber
    global sessionid
    if (fetchnumber > 50) or (sessionid == ""):
        sessionurl = 'http://www.meganslaw.ca.gov/cgi/prosoma.dll?searchby=curno'
        result = scrapemark.scrape("{{ page.text }}", url=sessionurl)
        sessionid = str(result['page']['text'])
    if fetchnumber <= 50:
        fetchnumber += 1
    else:
        fetchnumber = 0
    return sessionid
def GetListOfLtt():
    ltt = scrape("""
        <table>
        {* <td>{{ [y].ltt_id }} withdrawn</td> *}
        </table>
        """, url=base_url)
    if ltt is not None:
        if 'y' in ltt:
            debug((len(ltt['y']), "items found"))
            debug(ltt['y'])
            for k in ltt['y']:
                k['ltt_status'] = "WITHDRAWN"
                k['date_scraped'] = ''
                scraperwiki.sqlite.save(unique_keys=["ltt_id"], data=k, table_name="ltt_data")
    ltt = scrape("""
        <table>
        {* <td><a href='{{ [y].ltt_url|abs }}'>{{ [y].ltt_id }}</a></td> *}
        </table>
        """, url=base_url)
    if ltt is not None:
        if 'y' in ltt:
            debug((len(ltt['y']), "items found"))
            debug(ltt['y'])
            for k in ltt['y']:
                k['ltt_status'] = "ACTIVE"
                k['date_scraped'] = ''
                GetLtt(k['ltt_url'])