def fetch_and_store(url):
    """Download and clean the article at *url*, persist it as a SearchDocument.

    Raises Http404 if the article cannot be fetched/cleaned for any reason.
    Returns the saved SearchDocument instance.
    """
    try:
        article_title, article_text = fetch_and_clean(url)
    except Exception:
        # Any fetch/clean failure is surfaced to the caller as a 404.
        raise Http404(url)

    document = SearchDocument()
    document.url = url
    document.title = article_title
    document.text = article_text
    document.save()
    return document
# NOTE(review): this function is defined twice in this module; the later
# definition shadows this one. The two copies should be deduplicated.
def search_against_url(request, url):
    """
    Accepts a URL as either a suffix of the URI or a POST request
    parameter. Downloads the content, feeds it through the
    readability article grabber, then submits the article text to
    superfastmatch for comparison.
    """
    scheme = urlparse(url)[0]
    if scheme not in ('http', 'https'):
        return search_page(
            request,
            error='The URL must begin with either http or https.')

    sfm = from_django_conf('sidebyside')
    try:
        (title, text) = fetch_and_clean(url)
    except requests.exceptions.Timeout:
        return search_page(
            request,
            error="Sorry, that news article couldn't be retrieved.")

    try:
        sfm_results = sfm.search(text=text, title=title, url=url)
        drop_silly_results(sfm_results)
        sort_by_coverage(sfm_results)

        # If they submit a url, don't return the exact same url in the
        # results. BUG fix: the original removed items from the list while
        # iterating it, which skips elements; rebuild the list instead.
        rows = sfm_results['documents']['rows']
        sfm_results['documents']['rows'] = [
            r for r in rows if r.get('url') != url]

        # dict.get replaces the deprecated has_key()/else branches.
        text = sfm_results.get('text', '')
        title = sfm_results.get('title', 'No Title')

        return search_result_page(request, sfm_results, text,
                                  source_title=title, source_url=url)
    except superfastmatch.SuperFastMatchError as e:
        if e.status == httplib.NOT_FOUND:
            # BUG fix: the original did `raise HttpResponse(...)`, which
            # raises a non-exception (TypeError at runtime). Return a
            # proper 404 response instead.
            return HttpResponse('No such article {0}'.format(url),
                                status=httplib.NOT_FOUND)
        elif settings.DEBUG:
            return HttpResponse(e.response[1], status=e.response[0])
        else:
            raise
# NOTE(review): this is the second definition of search_against_url in this
# module; it shadows the earlier one. The two copies should be deduplicated.
def search_against_url(request, url):
    """
    Accepts a URL as either a suffix of the URI or a POST request
    parameter. Downloads the content, feeds it through the
    readability article grabber, then submits the article text to
    superfastmatch for comparison.
    """
    scheme = urlparse(url)[0]
    if scheme not in ('http', 'https'):
        return search_page(
            request,
            error='The URL must begin with either http or https.')

    sfm = from_django_conf('sidebyside')
    try:
        (title, text) = fetch_and_clean(url)
    except requests.exceptions.Timeout:
        return search_page(
            request,
            error="Sorry, that news article couldn't be retrieved.")

    try:
        sfm_results = sfm.search(text=text, title=title, url=url)
        drop_silly_results(sfm_results)
        sort_by_coverage(sfm_results)

        # If they submit a url, don't return the exact same url in the
        # results. BUG fix: the original removed items from the list while
        # iterating it, which skips elements; rebuild the list instead.
        rows = sfm_results['documents']['rows']
        sfm_results['documents']['rows'] = [
            r for r in rows if r.get('url') != url]

        # dict.get replaces the deprecated has_key()/else branches.
        text = sfm_results.get('text', '')
        title = sfm_results.get('title', 'No Title')

        return search_result_page(request, sfm_results, text,
                                  source_title=title, source_url=url)
    except superfastmatch.SuperFastMatchError as e:
        if e.status == httplib.NOT_FOUND:
            # BUG fix: the original did `raise HttpResponse(...)`, which
            # raises a non-exception (TypeError at runtime). Return a
            # proper 404 response instead.
            return HttpResponse('No such article {0}'.format(url),
                                status=httplib.NOT_FOUND)
        elif settings.DEBUG:
            return HttpResponse(e.response[1], status=e.response[0])
        else:
            raise