def __init__(self, resources_collection, opener, config, worker_pool_size=5):
    """Set up the worker pool, the collaborators and the logger.

    Args:
        resources_collection: stored as-is on ``self._resources_collection``.
        opener: stored as-is on ``self._opener``.
        config: mapping; its ``'logging'`` entry is handed to
            ``logging.config.dictConfig``.
        worker_pool_size: value passed to the ``WorkerPool`` constructor
            (defaults to 5).
    """
    # Initialise the base class before touching any attribute.
    super(WebPageLoader, self).__init__()

    pool = WorkerPool(worker_pool_size)
    self._worker_pool = pool

    self._opener = opener
    self._resources_collection = resources_collection

    # dictConfig applies process-wide logging configuration; getLogger()
    # with no name returns the root logger.
    logging_settings = config['logging']
    logging.config.dictConfig(logging_settings)
    self._logger = logging.getLogger()
class WebPageLoader(Thread):
    """Thread that schedules web page downloads on a worker pool.

    Each resource returned by ``resources_collection.find_models()`` is
    handed to the worker pool, which runs ``_process_web_page`` on it.
    """

    # URI prefixes that are accepted without modification.
    _URI_SCHEMES = ('http://', 'https://')

    def __init__(self, resources_collection, opener, config, worker_pool_size=5):
        """Set up the worker pool, the collaborators and the logger.

        Args:
            resources_collection: object whose ``find_models()`` is iterated
                in ``run`` to obtain the resources to load.
            opener: object whose ``open(uri)`` returns a handle exposing
                ``url``, ``headers``, ``read()`` and ``close()``.
            config: mapping; its ``'logging'`` entry is handed to
                ``logging.config.dictConfig``.
            worker_pool_size: value passed to the ``WorkerPool`` constructor
                (defaults to 5).
        """
        super(WebPageLoader, self).__init__()
        self._worker_pool = WorkerPool(worker_pool_size)
        self._resources_collection = resources_collection
        self._opener = opener
        # dictConfig applies process-wide logging configuration; getLogger()
        # with no name returns the root logger.
        logging.config.dictConfig(config['logging'])
        self._logger = logging.getLogger()

    def run(self):
        """Queue a page-processing task for every resource, forever.

        NOTE(review): the loop has no delay or exit condition here, and the
        same resources may be queued again on every pass unless
        ``find_models()`` filters them or ``add_task`` blocks -- confirm
        against those implementations.
        """
        while True:
            # add the web page processing task
            for resource in self._resources_collection.find_models():
                self._worker_pool.add_task(self._process_web_page, resource)

    def _process_web_page(self, resource):
        """Download one resource, store its decoded content and enqueue it.

        ``resource.uri`` gets an ``http://`` prefix when it has no HTTP(S)
        scheme, and is then replaced by the final URL reported by the handle.
        On success the decoded body is stored in ``resource.content`` and
        the resource is passed to ``self._enqueue``.

        I/O, HTTP and Unicode decoding errors are logged and swallowed; the
        resource is not enqueued in that case.

        Args:
            resource: object with a writable ``uri`` attribute; ``content``
                is assigned on success.
        """
        # Add the default scheme only when none is present. The original
        # comparison against 'http://' alone turned 'https://host' into
        # 'http://https://host'. Schemes are matched case-insensitively.
        if not resource.uri.lower().startswith(self._URI_SCHEMES):
            resource.uri = 'http://' + resource.uri

        try:
            handle = self._opener.open(resource.uri)
            try:
                # The handle reports the URL actually fetched.
                resource.uri = handle.url
                encoding = detect_header_encoding(handle.headers.dict)
                resource.content = decode_html(handle.read(), encoding)
            finally:
                # Release the connection even if reading or decoding fails.
                handle.close()
            self._logger.info('Reading %s. Success.', resource.uri)
            # NOTE(review): _enqueue is not defined in the visible class
            # body; presumably provided elsewhere (e.g. a subclass).
            self._enqueue(resource)
        except (IOError, HTTPException) as e:
            # TODO: mark for retry (currently the failure is only logged)
            self._logger.error('Reading %s. IO error %s.', resource.uri, e)
        except UnicodeDecodeError as e:
            # TODO: mark for no more retries (currently only logged)
            self._logger.error('Reading %s. Unicode error %s.', resource.uri, e)