def crawl(start_url):
    """Recursively crawl starting from *start_url*.  Returns a set of
    urls that were found."""
    pool = evy.GreenPool()
    seen = set()
    fetch(start_url, seen, pool)
    pool.waitall()
    return seen
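# A minimal sketch of the imports and the fetch() helper that crawl() assumes.
# The url_regex pattern and the error handling are illustrative assumptions,
# not part of the original example; only GreenPool, spawn_n, and the
# evy.patched urllib2 module appear in the snippets above.
import re

import evy
from evy.patched import urllib2

url_regex = re.compile(r'https?://[^\s"\'<>]+')


def fetch(url, seen, pool):
    """Fetch *url* and spawn a new green thread for every unseen link."""
    print "fetching", url
    try:
        data = urllib2.urlopen(url).read()
    except urllib2.URLError:
        return
    for match in url_regex.finditer(data):
        new_url = match.group(0)
        # only follow links on evy.net so we don't crawl the whole internet
        if new_url not in seen and 'evy.net' in new_url:
            seen.add(new_url)
            # not stack-recursive: each spawned green thread gets its own stack
            pool.spawn_n(fetch, new_url, seen, pool)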
def launch_green_threads():
    from evy.patched import socket
    import evy

    def green_accepter(server_sock, pool):
        for i in xrange(CONCURRENCY):
            sock, addr = server_sock.accept()
            pool.spawn_n(reader, sock)

    # one green thread per reader and per writer, plus one for the accepter
    pool = evy.GreenPool(CONCURRENCY * 2 + 1)
    server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server_sock.bind(('localhost', 0))
    server_sock.listen(50)
    addr = ('localhost', server_sock.getsockname()[1])
    pool.spawn_n(green_accepter, server_sock, pool)
    for i in xrange(CONCURRENCY):
        pool.spawn_n(writer, addr, socket.socket)
    pool.waitall()
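# launch_green_threads() assumes a CONCURRENCY constant and reader/writer
# helpers defined elsewhere in the benchmark.  The versions below are
# illustrative sketches only: each writer connects and sends a fixed amount of
# data, and each reader drains its connection until the peer closes it.
CONCURRENCY = 50
DATA_TO_SEND = 'x' * 1024
ITERATIONS = 100


def reader(sock):
    """Read until the peer closes the connection."""
    while True:
        data = sock.recv(4096)
        if not data:
            break
    sock.close()


def writer(addr, socket_constructor):
    """Connect to *addr*, send a fixed amount of data, then close."""
    sock = socket_constructor()
    sock.connect(addr)
    for i in xrange(ITERATIONS):
        sock.sendall(DATA_TO_SEND)
    sock.close()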
def producer(start_url):
    """Recursively crawl starting from *start_url*.  Returns a set of
    urls that were found."""
    pool = evy.GreenPool()
    seen = set()
    q = evy.Queue()
    q.put(start_url)
    # keep looping if there are new urls, or workers that may produce more urls
    while True:
        while not q.empty():
            url = q.get()
            # limit requests to evy.net so we don't crash all over the internet
            if url not in seen and 'evy.net' in url:
                seen.add(url)
                pool.spawn_n(fetch, url, q)
        pool.waitall()
        if q.empty():
            break
    return seen
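# A sketch of the fetch() worker the producer assumes.  Unlike the recursive
# crawler's helper, it does not spawn anything itself: it downloads one page
# and pushes every link it finds back onto the queue for the producer loop to
# filter and dispatch.  url_regex is an illustrative assumption.
def fetch(url, outq):
    """Fetch *url* and push any urls found in its body onto *outq*."""
    print "fetching", url
    try:
        data = urllib2.urlopen(url).read()
    except urllib2.URLError:
        return
    for match in url_regex.finditer(data):
        outq.put(match.group(0))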
"""A simple web server that accepts POSTS containing a list of feed urls, and returns the titles of those feeds. """ import evy feedparser = evy.import_patched('feedparser') # the pool provides a safety limit on our concurrency pool = evy.GreenPool() def fetch_title(url): d = feedparser.parse(url) return d.feed.get('title', '') def app(environ, start_response): if environ['REQUEST_METHOD'] != 'POST': start_response('403 Forbidden', []) return [] # the pile collects the result of a concurrent operation -- in this case, # the collection of feed titles pile = evy.GreenPile(pool) for line in environ['wsgi.input'].readlines(): url = line.strip() if url: pile.spawn(fetch_title, url) # since the pile is an iterator over the results, # you can use it in all sorts of great Pythonic ways titles = '\n'.join(pile)
#! /usr/bin/env python
"""This is a simple web "crawler" that fetches a bunch of urls using a pool to
control the number of outbound connections.  It has as many simultaneously open
connections as coroutines in the pool.

The prints in the body of the fetch function are there to demonstrate that the
requests are truly made in parallel.
"""
urls = ["http://www.google.com/intl/en_ALL/images/logo.gif",
        "https://wiki.secondlife.com/w/images/secondlife.jpg",
        "http://us.i1.yimg.com/us.yimg.com/i/ww/beta/y3.gif"]

import evy
from evy.patched import urllib2


def fetch(url):
    print "opening", url
    body = urllib2.urlopen(url).read()
    print "done with", url
    return url, body


pool = evy.GreenPool(200)
for url, body in pool.imap(fetch, urls):
    print "got body from", url, "of length", len(body)