def store_request(read):
    """Parse a fetched request into a readable page and cache it.

    The page is keyed by the hash of the request's final url. When the
    originally requested url differs from the final url, a second small
    record is stored under the original url's hash that references the
    final record.

    :param read: the fetched request; this function reads its content,
        url/final_url, headers, timing and status attributes.
    :returns: the WebPage instance that was stored.

    """
    final_url = read.final_url
    page_key = generate_hash(final_url)
    article = Article(read.content, url=final_url)
    readable_html = article.readable

    try:
        page_title = article._original_document.title
    except AttributeError as exc:
        LOG.error(str(exc))
        page_title = 'Unknown'

    # The headers are copied into a plain dict because json encoding the
    # response's CaseInsensitiveDict directly breaks.
    request_details = {
        'content_type': read.content_type,
        'domain': read.domain,
        'final_url': read.final_url,
        'headers': dict(read.headers),
        'is_error': read.is_error,
        'request_time': read.request_time,
        'start_time': str(read.start_time),
        'status_code': read.status_code,
        'status_message': read.status_message,
        'url': read.url,
    }

    page = WebPage(
        hash_id=page_key,
        readable=readable_html,
        request=request_details,
        title=page_title,
        url=final_url,
    )
    serialized_page = json.dumps(dict(page))
    server.set(page_key, serialized_page)

    # Nothing more to do unless the requested url ended up somewhere else.
    if read.url == read.final_url:
        return page

    # Store an extra record pointing the original url at the final url
    # record.
    alias_key = generate_hash(read.url)
    alias_record = json.dumps({'reference': page_key})
    server.set(alias_key, alias_record)
    return page
def test_cached_webpage(self):
    """Parsing a url through /v leaves the page data cached in redis."""
    url = 'http://www.google.com/intl/en/about/index.html'
    expected_key = generate_hash(url)
    query = {'url': url}

    # The view answers with a redirect; following it lands on the actual
    # page.
    redirect = self.app.get('/v', params=query, status=302)
    redirect.follow()

    from bookie_parser.models import server

    # Make sure the data exists in redis.
    self.assertTrue(server.get(expected_key), 'The key is found.')

    # Now load the stored record and check what was written.
    cached = WebPageMgr.get(hash_id=expected_key)
    self.assertEqual(
        url, cached.url, "The url is stored in the root object")
    self.assertEqual(
        expected_key,
        cached.hash_id,
        "The hash is stored in the root object")
    self.assertTrue(
        cached.request is not None,
        'The request is stored in the cache.')
    self.assertEqual(u'Google - About Google', cached.title)
    self.assertTrue(cached.readable is not None)
def exists(hash_id=None, url=None):
    """Return the hash id of a stored page, or None when it is not stored.

    :param hash_id: the key to look up. When given, ``url`` is ignored.
    :param url: used only when ``hash_id`` is None; leading and trailing
        slashes are stripped and the result is hashed to get the key.
    :returns: the hash id if the server has that key, otherwise None.
        None is also returned when neither argument is supplied.

    """
    if hash_id is None:
        if url is None:
            # Nothing to look up; don't query the server with a None key.
            return None
        hash_id = generate_hash(url.strip('/'))

    return hash_id if server.exists(hash_id) else None
def test_viewable_response(self):
    """Requesting a url through /v ends on a readable html page."""
    url = 'http://www.google.com/intl/en/about/index.html'
    expected_key = generate_hash(url)
    query = {'url': url}

    # The view answers with a redirect; following it lands on the actual
    # page.
    redirect = self.app.get('/v', params=query, status=302)
    page = redirect.follow()
    body = page.body.decode('utf8')

    landed_on_hash = page.request.url.endswith(expected_key)
    self.assertTrue(
        landed_on_hash,
        'the url should end with the url hash')
    self.assertIn(
        "google.com",
        body,
        'we should find google in the body. ' + body)
def __init__(self, hash_id=None, url=None, readable=None, title=None,
             request=None, final_url=None):
    """Create a new WebPage data instance.

    Only truthy arguments are assigned onto the instance; a falsy
    argument leaves the matching attribute untouched by this method.

    :param hash_id: explicit id for the page. When falsy and ``url`` is
        truthy, the id is generated from ``url`` instead.
    :param url: the url of the page.
    :param readable: the readable content of the page.
    :param title: the title of the page.
    :param request: request details; it is indexed with ``'is_error'``
        and ``is_error`` is set to True on the instance when that value
        is truthy.
    :param final_url: the final url of the page.
    :raises KeyError: if a dict ``request`` has no ``'is_error'`` key.

    """
    if url:
        self.url = url

    if hash_id:
        self.hash_id = hash_id
    elif url:
        # No explicit id supplied: generate a new hash id from the url.
        self.hash_id = generate_hash(url)

    if readable:
        self.readable = readable

    if request:
        self.request = request
        if self.request['is_error']:
            self.is_error = True

    if final_url:
        self.final_url = final_url

    if title:
        self.title = title