import heapq
import itertools
from collections import namedtuple

# Assumed definition (not shown in the original): heapq compares namedtuples
# field by field, so entries order first by priority (depth), then by the
# tie-breaking id; the link sets themselves are never compared.
Links = namedtuple('Links', ['priority', 'id', 'links'])


def crawl_web(seed, max_depth=10, max_pages=1000):
    crawled = set()
    crawl_queue = []  # priority queue ensures that more "shallow" links are handled first
    index = {}
    graph = {}
    counter = itertools.count()

    def add_links(links, depth=0):
        """Add a set of links to crawl_queue, first removing any URL
        that is already in the set of crawled urls."""
        count = next(counter)
        new_links = links.difference(crawled)
        entry = Links(priority=depth, id=count, links=new_links)
        heapq.heappush(crawl_queue, entry)

    def index_page(page):
        """Add all of the words in page.content to the index, which maps
        lowercased words to sets of urls."""
        for word in page.content.split():
            word = word.lower()  # lowercase once, for both lookup and insert
            if word in index:
                index[word].add(page.url)
            else:
                index[word] = {page.url}

    add_links({seed}, 0)
    pages = 0
    while crawl_queue:
        entry = heapq.heappop(crawl_queue)
        to_crawl = entry.links
        depth = entry.priority
        while to_crawl and pages < max_pages:
            url = to_crawl.pop()
            page = Page(url)  # Page is an external dependency; see the sketch below
            if page.is_valid() and url not in crawled:
                print(url, depth)
                pages += 1
                crawled.add(url)
                index_page(page)
                graph[url] = page.outgoing_links
                if depth < max_depth:
                    add_links(page.outgoing_links, depth + 1)
    return index, graph
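

# --- Usage sketch (assumptions, not part of the original) ---
# crawl_web relies on a Page class exposing url, content, outgoing_links,
# and is_valid(), which is never defined above. The stub below is a minimal
# stand-in for that assumed interface: it fakes fetching with a hard-coded
# two-page "web" (FAKE_WEB, a hypothetical name) so the crawler can run
# end to end without network access.
FAKE_WEB = {
    'http://a.example': ('alpha beta', {'http://b.example'}),
    'http://b.example': ('beta gamma', set()),
}


class Page:
    def __init__(self, url):
        self.url = url
        content, links = FAKE_WEB.get(url, ('', set()))
        self.content = content
        self.outgoing_links = links

    def is_valid(self):
        return self.url in FAKE_WEB


if __name__ == '__main__':
    index, graph = crawl_web('http://a.example')
    print(index)  # word -> set of urls; e.g. 'beta' maps to both urls
    print(graph)  # adjacency map: url -> set of outgoing links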