from datetime import datetime

import Queue
import threading

import BeautifulSoup

from django.conf import settings
from django.shortcuts import render

# App-local imports; the exact module paths are assumptions based on usage.
from .forms import SearchForm
from .models import SpiderProfile
from .backends import get_backend
from .spider_threads import SpiderThread


def search(request):
    vars = {}
    if request.GET.get('query'):  # form was submitted
        page = int(request.GET.get('page', 1))
        per_page = int(request.GET.get('per_page', 100))
        search_form = SearchForm(request.GET)

        # Translate the form's '+'/'-' operators into the backend's
        # AND/NOT query syntax.
        query = search_form['query'].value().replace('+', ' AND ').replace(' -', ' NOT ')

        # Restrict the search to the selected profiles, unless every
        # profile was selected (in which case no site filter is needed).
        profile_ids = search_form['profiles'].value()
        if profile_ids and len(profile_ids) != len(search_form['profiles'].field.choices):
            profiles = SpiderProfile.objects.filter(active=True).filter(pk__in=profile_ids)
            profile_names = [profile.name for profile in profiles]
        else:
            profile_names = None

        backend = get_backend()

        # Time the query so the elapsed seconds can be reported alongside
        # the results. (A microsecond is 1e-6 s, so divide by 1,000,000.)
        start_time = datetime.now()
        results = backend.search(query, pagenum=page, pagelen=per_page,
                                 sites=profile_names)
        query_time = datetime.now() - start_time
        query_time_formatted = float(query_time.seconds) + \
            float(query_time.microseconds) / 1000000.0

        vars['query_time'] = query_time_formatted
        vars['page'] = page
        vars['pagecount'] = results['pagecount']
        vars['results'] = results['hits']
        vars['num_results'] = results['total_hits']
        vars['pagination_results'] = range(1, vars['num_results'] + 1)
        vars['start'] = (page - 1) * per_page + 1
        template = 'search_results.html'
    else:
        search_form = SearchForm()
        template = 'index.html'

    vars['search_form'] = search_form
    return render(request, template, vars)
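# A minimal sketch of the SearchForm the view above assumes: a free-text
# 'query' field plus a 'profiles' multiple-choice field whose choices are
# the active SpiderProfile rows. The field names mirror the view's usage;
# the actual form in the project may differ.
from django import forms


class SearchForm(forms.Form):
    query = forms.CharField(required=False)
    profiles = forms.MultipleChoiceField(required=False)

    def __init__(self, *args, **kwargs):
        super(SearchForm, self).__init__(*args, **kwargs)
        # Populate choices at instantiation time so newly added profiles
        # show up without a server restart.
        self.fields['profiles'].choices = [
            (p.pk, p.name) for p in SpiderProfile.objects.filter(active=True)]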
def spider(profile, log=True, fast=False, single_thread=False):
    depth = profile.depth
    indexer = get_backend(fast=fast)
    indexer.create_index(profile.name)

    # Worker threads pull URLs from pending_urls and push fetched pages
    # onto processed_responses; the bounded queue throttles the workers.
    pending_urls = Queue.Queue()
    processed_responses = Queue.Queue(maxsize=50)
    finished = threading.Event()

    visited = {}       # url -> response status, for urls already processed
    scheduled = set()  # urls queued but not yet processed

    if not single_thread:
        thread_count = profile.threads or getattr(settings, 'SPIDER_THREADS', 4)
    else:
        thread_count = 1

    threads = [SpiderThread(pending_urls, processed_responses, finished, profile)
               for _ in range(thread_count)]

    # Seed the crawl with the profile's base url.
    pending_urls.put((profile.base_url, '', depth))
    scheduled.add(profile.base_url)

    extractor = profile.get_extractor_class()

    for t in threads:
        t.start()

    processed_url = None
    try:
        while 1:
            try:
                # Pull an item from the response queue.
                result_dict, urls, depth = processed_responses.get(
                    timeout=profile.timeout)
            except Queue.Empty:
                # Check to see if any of the workers are still doing anything.
                done = True
                for t in threads:
                    if t.working:
                        print "Thread %s is still working, not exiting" % t
                        done = False
                if done:
                    print "All threads done working"
                    finished.set()
                    break
            else:
                # Save the result.
                processed_url = result_dict['url']
                raw_content = result_dict['content']
                unicode_content = BeautifulSoup.UnicodeDammit(
                    raw_content, isHTML=True).unicode
                e = extractor(unicode_content)
                if e.content_type == 'raw_ascii' and not getattr(
                        settings, 'MAGELLAN_INDEX_RAW_ASCII', False):
                    if log:
                        print "Skipping page at url: %s, no means of extracting content" % processed_url
                    continue  # don't index

                title = e.get_title()
                content = e.get_content()
                headings = e.get_headings()

                if log:
                    print "Adding page at url: %s, content length: %s to index" % (processed_url, len(content))
                indexer.add_page(url=processed_url, title=title, content=content,
                                 site=profile.name, headings=headings)

                # Remove from the list of scheduled items.
                scheduled.remove(processed_url)

                # Store response status in the visited dictionary.
                visited[processed_url] = result_dict['response_status']

                # Enqueue any urls that need to be checked.
                if depth > 0:
                    for url in urls:
                        if url not in visited and url not in scheduled:
                            scheduled.add(url)
                            pending_urls.put((url, processed_url, depth - 1))
    except KeyboardInterrupt:
        pass
    except Exception, e:
        print "Got an exception while indexing page: %s\nWill exit." % processed_url
        raise
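# A sketch (not the project's actual code) of the SpiderThread worker that
# spider() assumes. Each worker pulls (url, source, depth) tuples from
# pending_urls, fetches the page, and pushes (result_dict, urls, depth)
# onto processed_responses, where result_dict carries 'url', 'content' and
# 'response_status' and urls is the list of links found on the page. The
# 'working' flag is what the main loop polls before deciding all threads
# are idle. The fetching and link-extraction details here are assumptions.
import re
import urllib2
import urlparse


class SpiderThread(threading.Thread):
    def __init__(self, pending_urls, processed_responses, finished, profile):
        super(SpiderThread, self).__init__()
        self.pending_urls = pending_urls
        self.processed_responses = processed_responses
        self.finished = finished
        self.profile = profile
        self.working = False

    def run(self):
        while not self.finished.is_set():
            try:
                url, source, depth = self.pending_urls.get(timeout=1)
            except Queue.Empty:
                continue
            self.working = True
            try:
                response = urllib2.urlopen(url)
                content = response.read()
                # Naive link extraction; a real crawler would honour
                # robots.txt and stay within the profile's site.
                urls = [urlparse.urljoin(url, href)
                        for href in re.findall(r'href="([^"]+)"', content)]
                self.processed_responses.put(
                    ({'url': url,
                      'content': content,
                      'response_status': response.getcode()}, urls, depth))
            except Exception:
                pass  # a failed fetch simply produces no response item
            finally:
                self.working = False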