Exemple #1
0
    async def handle_redirects(self):
        """Save copies of target pages for redirect_policy='follow'

        Sweeps over ``self.redirecting_tasks`` until it is empty. For each
        task, depending on the status of the task it redirects to:

        * ``TaskStatus.FAILED``: nothing is saved; the task is moved to
          ``self.done_tasks``.
        * ``TaskStatus.DONE``: the target's saved file is opened through
          ``self.saver`` and saved again under this task's path, the
          ``page_frozen`` hook is called, and the task is moved to
          ``self.done_tasks``.
        * anything else: the task is left for a later sweep.

        Raises:
            InfiniteRedirection: if a whole sweep saves nothing while
                tasks are still waiting.
        """
        while self.redirecting_tasks:
            progressed = False

            # Entries are deleted during the sweep, so walk a snapshot.
            for key, task in tuple(self.redirecting_tasks.items()):
                target = task.redirects_to

                if target.status == TaskStatus.FAILED:
                    # Don't process redirects to failed pages
                    del self.redirecting_tasks[key]
                    self.done_tasks[task.path] = task
                    continue

                if target.status != TaskStatus.DONE:
                    # Target is not saved yet; retry on the next sweep.
                    continue

                # Copy the already-saved target content to this task's path.
                with await self.saver.open_filename(target.path) as f:
                    await self.saver.save_to_filename(task.path, f)
                self.call_hook('page_frozen', hooks.TaskInfo(task))

                del self.redirecting_tasks[key]
                self.done_tasks[task.path] = task
                progressed = True

            if not progressed and self.redirecting_tasks:
                # Nothing could be saved in this sweep. Report some task
                # (the first one we get by iteration) in the error message.
                stuck_task = next(iter(self.redirecting_tasks.values()))
                raise InfiniteRedirection(stuck_task)
Exemple #2
0
    def start_response(
        self,
        task,
        url,
        wsgi_write,
        status,
        headers,
        exc_info=None,
    ):
        """WSGI start_response hook

        The application we are freezing will call this method
        and supply the status, headers, exc_info arguments.
        (self, task, url and wsgi_write are provided by freezeyt.)

        See: https://www.python.org/dev/peps/pep-3333/#the-start-response-callable

        Arguments:
            task: the task being frozen; the response status and headers
                are stored on it.
            url: URL being requested; its ``path`` is passed to the
                MIME type check.
            wsgi_write: function that the application can call to output data
            status: HTTP status line, like '200 OK'
            headers: HTTP headers (list of tuples)
            exc_info: Information about a server error, if any.
                Will be raised if given.

        Returns:
            ``wsgi_write``, when the status handler's action is 'save'.

        Raises:
            UnexpectedStatus: no handler is registered for the status, or
                the handler returned an unknown action.
            IgnorePage: the handler's action is 'ignore'.
            IsARedirect: the handler's action is 'follow'.
        """
        if exc_info:
            try:
                _exc_type, value, tb = exc_info
                if value is not None:
                    if tb is not None:
                        # Re-raise with the traceback the application
                        # supplied, as PEP 3333 prescribes.
                        raise value.with_traceback(tb)
                    raise value
            finally:
                # Don't keep the traceback alive through this frame's
                # locals (avoids a reference cycle, see PEP 3333).
                exc_info = value = tb = None

        # Prefer a handler for the exact code ('404'); fall back to one for
        # the whole class ('4xx'). Slicing (rather than indexing) keeps an
        # empty status line from raising IndexError, so it is reported as
        # UnexpectedStatus below.
        status_handler = (
            self.status_handlers.get(status[:3])
            or self.status_handlers.get(status[:1] + 'xx')
        )
        if not status_handler:
            raise UnexpectedStatus(url, status, task.reasons)

        # Record the response on the task before calling the handler,
        # which receives the task wrapped in a TaskInfo.
        task.response_headers = Headers(headers)
        task.response_status = status

        status_action = status_handler(hooks.TaskInfo(task, self))

        if status_action == 'save':
            check_mimetype(
                url.path,
                headers,
                default=self.config.get(
                    'default_mimetype',
                    'application/octet-stream',
                ),
            )
            return wsgi_write
        elif status_action == 'ignore':
            raise IgnorePage()
        elif status_action == 'follow':
            raise IsARedirect()
        else:
            raise UnexpectedStatus(url, status, task.reasons)
Exemple #3
0
 async def handle_urls(self):
     """Await the in-progress tasks one at a time until none are left.

     Each round takes the first entry of ``self.inprogress_tasks`` and
     awaits its ``asyncio_task``. If awaiting raises, the task is moved
     to ``self.failed_tasks`` and the ``page_failed`` hook is called.

     Raises:
         ValueError: if a task is still in ``self.inprogress_tasks``
             after it was awaited.
     """
     while self.inprogress_tasks:
         # Peek at the first item without removing it: a dict has no
         # [0], and pop() would modify it. The loop condition guarantees
         # there is at least one item to take.
         path, task = next(iter(self.inprogress_tasks.items()))

         try:
             await task.asyncio_task
         except Exception:
             # Record the failure here and let the remaining tasks run.
             del self.inprogress_tasks[task.path]
             self.failed_tasks[task.path] = task
             self.call_hook('page_failed', hooks.TaskInfo(task))

         # Whatever happened, the entry must be gone by now; otherwise
         # this loop would await the same task forever.
         if path in self.inprogress_tasks:
             raise ValueError(f'{task} is in_progress after it was handled')
Exemple #4
0
    def handle_urls(self):
        """Freeze every pending task by requesting its URL from the WSGI app.

        For each task popped from ``self.pending_tasks``:

        1. build a WSGI environ for one of the task's URLs and call
           ``self.app``;
        2. save the response body (data passed to ``write()`` followed by
           the returned iterable) under the task's path;
        3. call the iterable's ``close()`` if it has one, even if saving
           failed, as PEP 3333 requires;
        4. if a URL finder is registered for the response's MIME type,
           run it on the saved file and add a task for each link found;
        5. move the task to ``self.done_tasks`` and call the
           ``page_frozen`` hook.

        If calling the app raises ``IsARedirect`` or ``IgnorePage``, the
        rest of the steps are skipped for that task.
        """
        while self.pending_tasks:
            file_path, task = self.pending_tasks.popitem()
            self.inprogress_tasks[task.path] = task

            # Get an URL from the task's set of URLs
            url = task.get_a_url()

            # url_string should not be needed (except for debug messages)
            url_string = url.to_url()

            # PATH_INFO is relative to the prefix (which is SCRIPT_NAME).
            path_info = url.path
            if path_info.startswith(self.prefix.path):
                path_info = "/" + path_info[len(self.prefix.path):]

            environ = {
                'SERVER_NAME': self.prefix.ascii_host,
                'SERVER_PORT': str(self.prefix.port),
                'REQUEST_METHOD': 'GET',
                'PATH_INFO': encode_wsgi_path(path_info),
                'SCRIPT_NAME': encode_wsgi_path(self.prefix.path),
                'SERVER_PROTOCOL': 'HTTP/1.1',
                'SERVER_SOFTWARE': 'freezeyt/0.1',
                'wsgi.version': (1, 0),
                'wsgi.url_scheme': 'http',
                'wsgi.input': io.BytesIO(),
                'wsgi.errors': sys.stderr,
                'wsgi.multithread': False,
                'wsgi.multiprocess': False,
                'wsgi.run_once': False,
                'freezeyt.freezing': True,
            }

            # The WSGI application can output data in two ways:
            # - by a "write" function, which, in our case, will append
            #   any data to a list, `wsgi_write_data`
            # - (preferably) by returning an iterable object.

            # See: https://www.python.org/dev/peps/pep-3333/#the-write-callable

            # Set up the wsgi_write_data, and make its `append` method
            # available to `start_response` as the `wsgi_write` argument:
            wsgi_write_data = []
            start_response = functools.partial(
                self.start_response,
                task,
                url,
                wsgi_write_data.append,
            )

            # Call the application. All calls to write (wsgi_write_data.append)
            # must be done as part of this call.
            try:
                result_iterable = self.app(environ, start_response)
            except (IsARedirect, IgnorePage):
                # NOTE(review): on these paths the task is left in
                # self.inprogress_tasks -- confirm that is intended.
                continue

            try:
                # Combine the list of data from write() with the returned
                # iterable object.
                full_result = itertools.chain(
                    wsgi_write_data,
                    result_iterable,
                )

                self.saver.save_to_filename(task.path, full_result)
            finally:
                # PEP 3333: close() must be called when the request is
                # over, whether it completed normally or failed.
                try:
                    close = result_iterable.close
                except AttributeError:
                    pass
                else:
                    close()

            # Re-open the saved page and look for links to other pages.
            with self.saver.open_filename(file_path) as f:
                content_type = task.response_headers.get('Content-Type')
                mime_type, encoding = parse_options_header(content_type)
                url_finder = self.url_finders.get(mime_type)
                if url_finder is not None:
                    links = url_finder(f, url_string,
                                       task.response_headers.to_wsgi_list())
                    for new_url_text in links:
                        new_url = url.join(decode_input_path(new_url_text))
                        try:
                            new_url = add_port(new_url)
                        except UnsupportedSchemeError:
                            # If this has a scheme other than http and https,
                            # it's an external url and we don't follow it.
                            pass
                        else:
                            self.add_task(
                                new_url,
                                external_ok=True,
                                reason=f'linked from {url}',
                            )

            del self.inprogress_tasks[task.path]
            self.done_tasks[task.path] = task

            self.call_hook('page_frozen', hooks.TaskInfo(task, self))
Exemple #5
0
 async def handle_content_task(self, task, content):
     """Save ready-made content for a task and mark the task as done.

     Arguments:
         task: the task being frozen; its ``path`` is where the content
             is saved and the key under which the task is tracked.
         content: the page content, handed to the saver as a
             single-item list.
     """
     destination = task.path

     # The saver is given a list, so wrap the content as its only item.
     await self.saver.save_to_filename(destination, [content])

     # Move the task from the in-progress dict to the done dict.
     del self.inprogress_tasks[destination]
     self.done_tasks[destination] = task

     self.call_hook('page_frozen', hooks.TaskInfo(task))