def search(request):
    profiles = SpiderProfile.objects.filter(active=True)
    vars = {'profiles': profiles}
    page_index = WhooshPageIndex()
    query = request.GET.get('q', None)
    if query:
        query = query.replace('+', ' AND ').replace(' -', ' NOT ')  # this is fairly lame
        results = page_index.search(query, limit=100)
        vars['query'] = query
        vars['results'] = [
            dict(title=r['title'], highlight=r.highlights('content'), url=r['url'])
            for r in results]
        vars['num_results'] = len(results)
    return render(request, 'search_results.html', vars)
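
# A minimal URLconf sketch for wiring up the search view above. The module
# path ('search.views'), URL pattern, and name are assumptions for
# illustration, not part of the original code.
from django.conf.urls import url

from search import views

urlpatterns = [
    url(r'^search/$', views.search, name='search'),
]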
def spider(profile, log=True):
    depth = profile.depth
    indexer = WhooshPageIndex()
    pending_urls = Queue.Queue()
    processed_responses = Queue.Queue()
    finished = threading.Event()
    visited = {}
    scheduled = set()

    thread_count = profile.threads or getattr(settings, 'SPIDER_THREADS', 4)
    threads = [SpiderThread(pending_urls, processed_responses, finished, profile)
               for x in range(thread_count)]

    # seed the queue with the profile's base url
    pending_urls.put((profile.base_url, '', depth))
    scheduled.add(profile.base_url)

    extractor = profile.get_extractor_class()

    for t in threads:
        t.start()

    # initialized up front so the exception handler below can reference them
    processed_url = None
    headings = None

    try:
        while 1:
            try:
                # pull an item from the response queue
                result_dict, urls, depth = processed_responses.get(timeout=profile.timeout)
            except Queue.Empty:
                # check to see if any of the workers are still doing anything
                done = True
                for t in threads:
                    if t.working:
                        print "Thread %s is still working, not exiting" % t
                        done = False
                if done:
                    print "All threads done working"
                    finished.set()
                    break
            else:
                # save the result
                processed_url = result_dict['url']
                if log:
                    print "Adding page at url: %s, content length: %s to index" % (
                        processed_url, len(result_dict['content']))

                raw_content = result_dict['content']
                e = extractor(raw_content)
                title = e.get_title()
                content = e.get_content()
                headings = e.get_headings()

                indexer.add_page(
                    url=processed_url,
                    title=title,
                    content=content,
                    site=profile.name,
                    headings=headings)

                # remove from the list of scheduled items
                scheduled.remove(processed_url)

                # store response status in the visited dictionary
                visited[processed_url] = result_dict['response_status']

                # enqueue any urls that need to be checked
                if depth > 0:
                    for url in urls:
                        if url not in visited and url not in scheduled:
                            scheduled.add(url)
                            pending_urls.put((url, processed_url, depth - 1))
    except KeyboardInterrupt:
        pass
    except Exception, e:
        print "Got an exception while indexing page: %s\nWill exit." % processed_url
        print "Headings: %s" % headings
        raise
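
# A rough sketch of how spider() might be driven -- crawling every active
# SpiderProfile in turn, mirroring the queryset used by the search view.
# The run_spiders() name is an assumption; this could just as easily live in a
# management command or a cron script.
def run_spiders(log=True):
    for profile in SpiderProfile.objects.filter(active=True):
        spider(profile, log=log)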