async def handle_redirects(self):
    """Save copies of target pages for redirect_policy='follow'"""
    while self.redirecting_tasks:
        saved_something = False
        for key, task in list(self.redirecting_tasks.items()):
            if task.redirects_to.status == TaskStatus.FAILED:
                # Don't process redirects to failed pages
                del self.redirecting_tasks[key]
                self.done_tasks[task.path] = task
                continue
            if task.redirects_to.status != TaskStatus.DONE:
                continue

            with await self.saver.open_filename(task.redirects_to.path) as f:
                await self.saver.save_to_filename(task.path, f)
            self.call_hook('page_frozen', hooks.TaskInfo(task))
            del self.redirecting_tasks[key]
            self.done_tasks[task.path] = task
            saved_something = True
        if not saved_something:
            # Get some task (the first one we get by iteration) for the
            # error message.
            for task in self.redirecting_tasks.values():
                raise InfiniteRedirection(task)
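# For example, if /a redirects to /b and /b redirects back to /a, neither
# redirect target ever reaches TaskStatus.DONE, so no iteration of the loop
# above saves anything and InfiniteRedirection is raised for one of the
# stuck tasks.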
def start_response(
    self, task, url, wsgi_write, status, headers, exc_info=None,
):
    """WSGI start_response hook

    The application we are freezing will call this method
    and supply the status, headers, exc_info arguments.
    (self and wsgi_write are provided by freezeyt.)

    See: https://www.python.org/dev/peps/pep-3333/#the-start-response-callable

    Arguments:
        wsgi_write: function that the application can call to output data
        status: HTTP status line, like '200 OK'
        headers: HTTP headers (list of tuples)
        exc_info: Information about a server error, if any.
            Will be raised if given.
    """
    if exc_info:
        exc_type, value, traceback = exc_info
        if value is not None:
            raise value

    if self.status_handlers.get(status[:3]):
        status_handler = self.status_handlers.get(status[:3])
    elif self.status_handlers.get(status[0] + 'xx'):
        status_handler = self.status_handlers.get(status[0] + 'xx')
    else:
        raise UnexpectedStatus(url, status, task.reasons)

    task.response_headers = Headers(headers)
    task.response_status = status

    status_action = status_handler(hooks.TaskInfo(task, self))

    if status_action == 'save':
        check_mimetype(
            url.path, headers,
            default=self.config.get(
                'default_mimetype', 'application/octet-stream',
            ),
        )
        return wsgi_write
    elif status_action == 'ignore':
        raise IgnorePage()
    elif status_action == 'follow':
        raise IsARedirect()
    else:
        raise UnexpectedStatus(url, status, task.reasons)
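# A sketch of what a status handler might look like (hypothetical example,
# assuming handlers are plain callables as used above): a handler is looked
# up in self.status_handlers under an exact code like '404' or a wildcard
# like '4xx', receives a hooks.TaskInfo, and returns one of the actions
# handled above ('save', 'ignore' or 'follow').
#
#     def save_error_pages(task_info):
#         # Freeze error responses as regular pages
#         return 'save'
#
#     status_handlers = {'404': save_error_pages, '5xx': save_error_pages}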
async def handle_urls(self):
    while self.inprogress_tasks:
        # Get an item from self.inprogress_tasks.
        # Since this is a dict, we can't do self.inprogress_tasks[0];
        # and since we don't want to change it we can't use pop().
        # So, start iterating over it, and break the loop immediately
        # when we get the first item.
        for path, task in self.inprogress_tasks.items():
            break

        try:
            await task.asyncio_task
        except Exception:
            del self.inprogress_tasks[task.path]
            self.failed_tasks[task.path] = task
            self.call_hook('page_failed', hooks.TaskInfo(task))

        if path in self.inprogress_tasks:
            raise ValueError(f'{task} is in_progress after it was handled')
def handle_urls(self):
    while self.pending_tasks:
        file_path, task = self.pending_tasks.popitem()
        self.inprogress_tasks[task.path] = task

        # Get a URL from the task's set of URLs
        url_parsed = task.get_a_url()
        url = url_parsed

        # url_string should not be needed (except for debug messages)
        url_string = url_parsed.to_url()

        path_info = url_parsed.path

        if path_info.startswith(self.prefix.path):
            path_info = "/" + path_info[len(self.prefix.path):]

        environ = {
            'SERVER_NAME': self.prefix.ascii_host,
            'SERVER_PORT': str(self.prefix.port),
            'REQUEST_METHOD': 'GET',
            'PATH_INFO': encode_wsgi_path(path_info),
            'SCRIPT_NAME': encode_wsgi_path(self.prefix.path),
            'SERVER_PROTOCOL': 'HTTP/1.1',
            'SERVER_SOFTWARE': 'freezeyt/0.1',

            'wsgi.version': (1, 0),
            'wsgi.url_scheme': 'http',
            'wsgi.input': io.BytesIO(),
            'wsgi.errors': sys.stderr,
            'wsgi.multithread': False,
            'wsgi.multiprocess': False,
            'wsgi.run_once': False,

            'freezeyt.freezing': True,
        }

        # The WSGI application can output data in two ways:
        # - by a "write" function, which, in our case, will append
        #   any data to a list, `wsgi_write_data`
        # - (preferably) by returning an iterable object.
        # See: https://www.python.org/dev/peps/pep-3333/#the-write-callable

        # Set up the wsgi_write_data, and make its `append` method
        # available to `start_response` as first argument:
        wsgi_write_data = []
        start_response = functools.partial(
            self.start_response,
            task,
            url,
            wsgi_write_data.append,
        )

        # Call the application. All calls to write (wsgi_write_data.append)
        # must be done as part of this call.
        try:
            result_iterable = self.app(environ, start_response)
        except IsARedirect:
            continue
        except IgnorePage:
            continue

        # Combine the list of data from write() with the returned
        # iterable object.
        full_result = itertools.chain(
            wsgi_write_data,
            result_iterable,
        )

        self.saver.save_to_filename(task.path, full_result)

        try:
            close = result_iterable.close
        except AttributeError:
            pass
        else:
            close()

        with self.saver.open_filename(file_path) as f:
            content_type = task.response_headers.get('Content-Type')
            mime_type, encoding = parse_options_header(content_type)
            url_finder = self.url_finders.get(mime_type)
            if url_finder is not None:
                links = url_finder(
                    f, url_string, task.response_headers.to_wsgi_list())
                for new_url_text in links:
                    new_url = url.join(decode_input_path(new_url_text))
                    try:
                        new_url = add_port(new_url)
                    except UnsupportedSchemeError:
                        # If this has a scheme other than http and https,
                        # it's an external url and we don't follow it.
                        pass
                    else:
                        self.add_task(
                            new_url,
                            external_ok=True,
                            reason=f'linked from {url}',
                        )

        del self.inprogress_tasks[task.path]
        self.done_tasks[task.path] = task

        self.call_hook('page_frozen', hooks.TaskInfo(task, self))
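# A sketch of the url_finder interface assumed above (hypothetical example,
# not one of freezeyt's own finders): a finder is looked up by MIME type and
# called as url_finder(page_file, page_url, headers), where `page_file` is
# the saved page opened for reading, `page_url` is the page's URL as a
# string, and `headers` is a list of (name, value) tuples. It should return
# an iterable of URL strings found in the page.
#
#     import re
#
#     def naive_href_finder(page_file, page_url, headers):
#         # Very rough finder: pull href="..." values out of the page.
#         data = page_file.read()
#         if isinstance(data, bytes):
#             data = data.decode('utf-8', errors='replace')
#         return re.findall(r'href="([^"]+)"', data)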
async def handle_content_task(self, task, content):
    await self.saver.save_to_filename(task.path, [content])
    del self.inprogress_tasks[task.path]
    self.done_tasks[task.path] = task
    self.call_hook('page_frozen', hooks.TaskInfo(task))