def search(request):
    profiles = SpiderProfile.objects.filter(active=True)
    context = {'profiles': profiles}

    page_index = WhooshPageIndex()
    query = request.GET.get('q')
    if query:
        # fairly crude: map '+' and ' -' onto Whoosh's AND / NOT operators
        query = query.replace('+', ' AND ').replace(' -', ' NOT ')
        results = page_index.search(query, limit=100)
        context['query'] = query
        context['results'] = [
            dict(title=r['title'], highlight=r.highlights('content'), url=r['url'])
            for r in results
        ]
        context['num_results'] = len(results)

    return render(request, 'search_results.html', context)
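
# For reference, both functions here lean on a WhooshPageIndex wrapper with
# add_page() and search() methods. That class is defined elsewhere in this
# project; the sketch below is only a rough guess at its shape, assuming a
# Whoosh index kept in a (hypothetical) settings.WHOOSH_INDEX_DIR with
# url/title/content/site/headings fields.
import os

from django.conf import settings
from whoosh import index
from whoosh.fields import ID, TEXT, Schema
from whoosh.qparser import QueryParser

SEARCH_SCHEMA = Schema(
    url=ID(stored=True, unique=True),
    title=TEXT(stored=True),
    content=TEXT(stored=True),
    site=ID(stored=True),
    headings=TEXT(stored=True),
)

class WhooshPageIndex(object):
    def __init__(self, index_dir=None):
        index_dir = index_dir or settings.WHOOSH_INDEX_DIR
        if index.exists_in(index_dir):
            self.ix = index.open_dir(index_dir)
        else:
            if not os.path.exists(index_dir):
                os.makedirs(index_dir)
            self.ix = index.create_in(index_dir, SEARCH_SCHEMA)

    def add_page(self, url, title, content, site, headings):
        # update_document() replaces any existing entry with the same unique url
        writer = self.ix.writer()
        writer.update_document(url=url, title=title, content=content,
                               site=site, headings=headings)
        writer.commit()

    def search(self, query, limit=100):
        # the caller iterates the hits and pulls highlights, so the searcher is
        # deliberately left open for the lifetime of the results
        searcher = self.ix.searcher()
        parsed = QueryParser('content', schema=self.ix.schema).parse(query)
        return searcher.search(parsed, limit=limit)
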
def spider(profile, log=True):
    depth = profile.depth
    indexer = WhooshPageIndex()
    pending_urls = Queue.Queue()         # urls waiting to be fetched by a worker
    processed_responses = Queue.Queue()  # fetched pages waiting to be indexed
    finished = threading.Event()         # tells the worker threads to shut down

    visited = {}       # url -> response status for every page already handled
    scheduled = set()  # urls queued or in flight, so nothing is fetched twice

    thread_count = profile.threads or getattr(settings, 'SPIDER_THREADS', 4)

    threads = [SpiderThread(pending_urls, processed_responses, finished, profile)
               for _ in range(thread_count)]

    # seed the crawl with the profile's base url at full depth
    pending_urls.put((profile.base_url, '', depth))
    scheduled.add(profile.base_url)

    extractor = profile.get_extractor_class()

    for t in threads:
        t.start()
    processed_url = None
    headings = None
    try:
        while True:
            try:
                # pull an item from the response queue
                result_dict, urls, depth = processed_responses.get(timeout=profile.timeout)
            except Queue.Empty:
                # nothing arrived within the timeout -- see whether any worker
                # is still busy before deciding the crawl is finished
                done = True
                for t in threads:
                    if t.working:
                        if log:
                            print "Thread %s is still working, not exiting" % t
                        done = False
                if done:
                    if log:
                        print "All threads done working"
                    finished.set()
                    break
            else:
                # index the page that a worker just fetched
                processed_url = result_dict['url']
                if log:
                    print "Adding page at url: %s, content length: %s to index" % (processed_url, len(result_dict['content']))

                raw_content = result_dict['content']
                extracted = extractor(raw_content)
                title = extracted.get_title()
                content = extracted.get_content()
                headings = extracted.get_headings()

                indexer.add_page(url=processed_url, title=title, content=content, site=profile.name, headings=headings)

                # the url is no longer pending; discard() avoids a KeyError if a
                # redirect changed the url the worker reported back
                scheduled.discard(processed_url)

                # store the response status so this url is never fetched again
                visited[processed_url] = result_dict['response_status']

                # enqueue any extracted urls that haven't been seen yet
                if depth > 0:
                    for url in urls:
                        if url not in visited and url not in scheduled:
                            scheduled.add(url)
                            pending_urls.put((url, processed_url, depth - 1))
    except KeyboardInterrupt:
        pass
    except Exception:
        print "Got an exception while indexing page: %s\nWill exit." % processed_url
        print "Headings: %s" % headings
        raise
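
# A hypothetical way to kick off a crawl -- the loop below just walks every
# active SpiderProfile and runs spider() on it. It is not part of the original
# module; it only shows how the function above might be invoked, e.g. from a
# Django shell or a cron'd management command.
def crawl_active_profiles(log=True):
    for profile in SpiderProfile.objects.filter(active=True):
        spider(profile, log=log)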