Esempio n. 1
0
def get_links_from_node(node: xml.dom.minidom.Node, base_url) -> list:
    """Collect link targets from a node and all of its descendants.

    The values of the ``href`` and ``src`` attributes (looked up in
    ``node.attrib``) are decoded with ``decode_input_path`` and resolved
    against ``base_url``.

    Within one node, ``href`` is reported before ``src``; a node's own
    links are reported before the links of its children.
    """
    links = []
    attributes = node.attrib
    for attr_name in ('href', 'src'):
        if attr_name in attributes:
            target = decode_input_path(attributes[attr_name])
            links.append(urljoin(base_url, target))
    # Iterating over a node yields its children; descend into each of them.
    for child in node:
        links += get_links_from_node(child, base_url)
    return links
Esempio n. 2
0
    def _add_extra_pages(self, prefix, extras):
        """Add URLs of extra pages from config.

        Handles both literal URLs and generators.

        Each item of ``extras`` may be:

        - a string: a URL that is decoded, resolved against ``prefix`` and
          added as a task;
        - a dict with a ``"generator"`` key whose value is either a callable
          or a string naming one (resolved with
          ``import_variable_from_module``);
        - anything else: used as the generator callable itself.

        A generator is called with ``self.app``; whatever it returns is
        processed recursively as another collection of extras.

        Raises:
            ValueError: if a dict item has no ``"generator"`` key.
            ExternalURLError: if ``add_task`` raises it for a literal URL.
        """
        for extra in extras:
            if isinstance(extra, str):
                url = parse_absolute_url(
                    urljoin(prefix, decode_input_path(extra)))
                try:
                    self.add_task(
                        url,
                        reason='extra page',
                    )
                except ExternalURLError as err:
                    # Re-raise with a message that points at the config
                    # option, keeping the original error as the cause.
                    raise ExternalURLError(
                        f'External URL specified in extra_pages: {url}'
                    ) from err
                continue

            if isinstance(extra, dict):
                try:
                    generator = extra['generator']
                except KeyError:
                    # The KeyError is an implementation detail; don't show
                    # it in the traceback.
                    raise ValueError(
                        'extra_pages must be strings or dicts with ' +
                        f'a "generator" key, not `{extra}`') from None
                if isinstance(generator, str):
                    generator = import_variable_from_module(generator)
            else:
                generator = extra

            self._add_extra_pages(prefix, generator(self.app))
Esempio n. 3
0
def test_decode_input_path_unicode():
    """Non-ASCII characters in the input come back unchanged."""
    path = '/čau/☺フ'
    assert decode_input_path(path) == path
Esempio n. 4
0
def test_decode_input_path_ascii():
    """A plain ASCII path is returned as-is."""
    path = '/ahoj'
    assert decode_input_path(path) == path
Esempio n. 5
0
def test_decode_input_path_surrogateescape():
    """A percent-escaped byte that is not valid UTF-8 becomes a surrogate."""
    decoded = decode_input_path('/%8Dau')
    assert decoded == '/\udc8dau'
Esempio n. 6
0
def test_decode_input_path_percent():
    """Percent-encoded UTF-8 bytes are decoded to the character they form."""
    decoded = decode_input_path('/%C4%8Dau')
    assert decoded == '/čau'
Esempio n. 7
0
    def handle_urls(self):
        """Freeze pages until no pending tasks remain.

        Each pending task is moved to ``inprogress_tasks``, one of its URLs
        is requested from the WSGI application (``self.app``), and the
        response body is stored under ``task.path`` by ``self.saver``.
        If a URL finder is registered for the response's MIME type, the
        links it finds in the saved content are added as new tasks.
        Finally the task is moved to ``done_tasks`` and the ``page_frozen``
        hook is called.

        When calling the application raises ``IsARedirect`` or
        ``IgnorePage``, nothing is saved here and the loop moves on to the
        next pending task.
        """
        while self.pending_tasks:
            file_path, task = self.pending_tasks.popitem()
            self.inprogress_tasks[task.path] = task

            # Get an URL from the task's set of URLs
            url = task.get_a_url()

            # url_string should not be needed (except for debug messages)
            url_string = url.to_url()

            # PATH_INFO is relative to the prefix path, which is passed
            # separately as SCRIPT_NAME.
            path_info = url.path
            if path_info.startswith(self.prefix.path):
                path_info = "/" + path_info[len(self.prefix.path):]

            # NOTE(review): 'wsgi.url_scheme' is always 'http', regardless
            # of the scheme of the prefix -- confirm this is intended.
            environ = {
                'SERVER_NAME': self.prefix.ascii_host,
                'SERVER_PORT': str(self.prefix.port),
                'REQUEST_METHOD': 'GET',
                'PATH_INFO': encode_wsgi_path(path_info),
                'SCRIPT_NAME': encode_wsgi_path(self.prefix.path),
                'SERVER_PROTOCOL': 'HTTP/1.1',
                'SERVER_SOFTWARE': 'freezeyt/0.1',
                'wsgi.version': (1, 0),
                'wsgi.url_scheme': 'http',
                'wsgi.input': io.BytesIO(),
                'wsgi.errors': sys.stderr,
                'wsgi.multithread': False,
                'wsgi.multiprocess': False,
                'wsgi.run_once': False,
                'freezeyt.freezing': True,
            }

            # The WSGI application can output data in two ways:
            # - by a "write" function, which, in our case, will append
            #   any data to a list, `wsgi_write_data`
            # - (preferably) by returning an iterable object.

            # See: https://www.python.org/dev/peps/pep-3333/#the-write-callable

            # Set up the wsgi_write_data, and make its `append` method
            # available to `start_response` as first argument:
            wsgi_write_data = []
            start_response = functools.partial(
                self.start_response,
                task,
                url,
                wsgi_write_data.append,
            )

            # Call the application. All calls to write (wsgi_write_data.append)
            # must be done as part of this call.
            try:
                result_iterable = self.app(environ, start_response)
            except (IsARedirect, IgnorePage):
                continue

            try:
                # Combine the list of data from write() with the returned
                # iterable object.
                full_result = itertools.chain(
                    wsgi_write_data,
                    result_iterable,
                )

                self.saver.save_to_filename(task.path, full_result)
            finally:
                # PEP 3333 requires close() (if present) to be called when
                # the request is over, even if consuming the body failed.
                close = getattr(result_iterable, 'close', None)
                if close is not None:
                    close()

            with self.saver.open_filename(file_path) as f:
                content_type = task.response_headers.get('Content-Type')
                # Only the first item of the parsed header is used here.
                mime_type, _options = parse_options_header(content_type)
                url_finder = self.url_finders.get(mime_type)
                if url_finder is not None:
                    links = url_finder(f, url_string,
                                       task.response_headers.to_wsgi_list())
                    for new_url_text in links:
                        new_url = url.join(decode_input_path(new_url_text))
                        try:
                            new_url = add_port(new_url)
                        except UnsupportedSchemeError:
                            # If this has a scheme other than http and https,
                            # it's an external url and we don't follow it.
                            pass
                        else:
                            self.add_task(
                                new_url,
                                external_ok=True,
                                reason=f'linked from {url}',
                            )

            del self.inprogress_tasks[task.path]
            self.done_tasks[task.path] = task

            self.call_hook('page_frozen', hooks.TaskInfo(task, self))
Esempio n. 8
0
    def __init__(self, app, config):
        """Set up the freezer for ``app`` according to ``config``.

        Keys read from ``config``: ``extra_pages``, ``extra_files``,
        ``url_finders``, ``status_handlers``, ``prefix`` (defaults to
        ``'http://localhost:8000/'``), ``output`` (required),
        ``url_to_path`` and ``hooks``.

        ``output`` is either a string (taken as an output directory) or a
        dict with a ``'type'`` key (``'dict'`` or ``'dir'``); the ``'dir'``
        type also needs a ``'dir'`` key.

        Besides storing the configuration, this adds the first tasks: the
        site root and the configured extra pages.

        Raises:
            KeyError: if ``config`` has no ``'output'`` key.
            ValueError: if the path of the prefix does not end with ``/``,
                if the output directory is not specified, or if the output
                type is unknown.
        """
        self.app = app
        self.config = config

        self.freeze_info = hooks.FreezeInfo(self)

        self.extra_pages = config.get('extra_pages', ())
        self.extra_files = config.get('extra_files', None)

        self.url_finders = parse_handlers(
            config.get('url_finders', DEFAULT_URL_FINDERS),
            default_module='freezeyt.url_finders')

        # Status handlers from config are layered over the defaults.
        _status_handlers = dict(DEFAULT_STATUS_HANDLERS,
                                **config.get('status_handlers', {}))
        self.status_handlers = parse_handlers(
            _status_handlers, default_module='freezeyt.status_handlers')

        prefix = config.get('prefix', 'http://localhost:8000/')

        # Decode path in the prefix URL.
        # Save the parsed version of prefix as self.prefix
        prefix_parsed = parse_absolute_url(prefix)
        decoded_path = decode_input_path(prefix_parsed.path)
        if not decoded_path.endswith('/'):
            raise ValueError('prefix must end with /')
        self.prefix = prefix_parsed.replace(path=decoded_path)

        output = config['output']
        if isinstance(output, str):
            # A bare string is shorthand for a directory output.
            output = {'type': 'dir', 'dir': output}

        if output['type'] == 'dict':
            self.saver = DictSaver(self.prefix)
        elif output['type'] == 'dir':
            try:
                output_dir = output['dir']
            except KeyError:
                # The KeyError is an implementation detail; don't show it
                # in the traceback.
                raise ValueError("output directory not specified") from None
            self.saver = FileSaver(Path(output_dir), self.prefix)
        else:
            raise ValueError(f"unknown output type {output['type']}")

        self.url_to_path = config.get('url_to_path', default_url_to_path)
        if isinstance(self.url_to_path, str):
            self.url_to_path = import_variable_from_module(self.url_to_path)

        # The tasks for individual pages are tracked in the following sets
        # (actually dictionaries: {task.path: task})
        # Each task must be in exactly one of these.
        self.done_tasks = {}
        self.redirecting_tasks = {}
        self.pending_tasks = {}
        self.inprogress_tasks = {}
        self.task_queues = {
            TaskStatus.PENDING: self.pending_tasks,
            TaskStatus.DONE: self.done_tasks,
            TaskStatus.REDIRECTING: self.redirecting_tasks,
            TaskStatus.IN_PROGRESS: self.inprogress_tasks,
        }

        self.add_task(prefix_parsed, reason='site root (homepage)')
        self._add_extra_pages(prefix, self.extra_pages)

        # Hooks may be given as callables or as strings naming them.
        self.hooks = {}
        for name, func in config.get('hooks', {}).items():
            if isinstance(func, str):
                func = import_variable_from_module(func)
            self.hooks[name] = func
Esempio n. 9
0
    def __init__(self, app, config):
        """Set up the freezer for ``app`` according to ``config``.

        Keys read from ``config``: ``version``, ``extra_pages``,
        ``extra_files``, ``default_mimetype``, ``get_mimetype``,
        ``mime_db_file``, ``url_to_path``, ``use_default_url_finders``,
        ``url_finders``, ``status_handlers``, ``prefix`` (defaults to
        ``'http://localhost:8000/'``), ``output`` (required), ``hooks``
        and ``plugins``.

        ``output`` is either a string (taken as an output directory) or a
        dict with a ``'type'`` key (``'dict'`` or ``'dir'``); the ``'dir'``
        type also needs a ``'dir'`` key.

        Besides storing the configuration, this adds the first tasks (site
        root, extra files, extra pages), registers hooks and calls each
        plugin with ``self.freeze_info``. If any of that fails,
        ``cancel_tasks`` is called before the exception propagates.

        Raises:
            KeyError: if ``config`` has no ``'output'`` key.
            ValueError: if a status handler key does not match
                ``STATUS_KEY_RE``, if the path of the prefix does not end
                with ``/``, if the output directory is not specified, or
                if the output type is unknown.
        """
        self.app = app
        self.config = config
        self.check_version(self.config.get('version'))

        self.freeze_info = hooks.FreezeInfo(self)

        # Simple options: each one is copied from config (or its default)
        # to an attribute of the same name.
        simple_options = (
            ('extra_pages', ()),
            ('extra_files', None),
            ('default_mimetype', 'application/octet-stream'),
            ('get_mimetype', default_mimetype),
            ('mime_db_file', None),
            ('url_to_path', default_url_to_path),
        )
        for attr_name, default in simple_options:
            setattr(self, attr_name, config.get(attr_name, default))

        if self.mime_db_file:
            # JSON is UTF-8 by specification; don't depend on the locale.
            with open(self.mime_db_file, encoding='utf-8') as file:
                mime_db = json.load(file)

            # A MIME database file replaces whatever get_mimetype was set.
            mime_db = convert_mime_db(mime_db)
            self.get_mimetype = functools.partial(mime_db_mimetype, mime_db)

        if isinstance(self.get_mimetype, str):
            self.get_mimetype = import_variable_from_module(self.get_mimetype)

        if isinstance(self.url_to_path, str):
            self.url_to_path = import_variable_from_module(self.url_to_path)

        if config.get('use_default_url_finders', True):
            # URL finders from config are layered over the defaults.
            _url_finders = dict(DEFAULT_URL_FINDERS,
                                **config.get('url_finders', {}))
        else:
            _url_finders = config.get('url_finders', {})

        self.url_finders = parse_handlers(
            _url_finders, default_module='freezeyt.url_finders')

        # Status handlers from config are layered over the defaults.
        _status_handlers = dict(DEFAULT_STATUS_HANDLERS,
                                **config.get('status_handlers', {}))
        self.status_handlers = parse_handlers(
            _status_handlers, default_module='freezeyt.status_handlers')
        for key in self.status_handlers:
            if not STATUS_KEY_RE.fullmatch(key):
                raise ValueError(
                    'Status descriptions must be strings with 3 digits or one '
                    + f'digit and "xx", got {key!r}')

        prefix = config.get('prefix', 'http://localhost:8000/')

        # Decode path in the prefix URL.
        # Save the parsed version of prefix as self.prefix
        prefix_parsed = parse_absolute_url(prefix)
        decoded_path = decode_input_path(prefix_parsed.path)
        if not decoded_path.endswith('/'):
            raise ValueError('prefix must end with /')
        self.prefix = prefix_parsed.replace(path=decoded_path)

        output = config['output']
        if isinstance(output, str):
            # A bare string is shorthand for a directory output.
            output = {'type': 'dir', 'dir': output}

        if output['type'] == 'dict':
            self.saver = DictSaver(self.prefix)
        elif output['type'] == 'dir':
            try:
                output_dir = output['dir']
            except KeyError:
                # The KeyError is an implementation detail; don't show it
                # in the traceback.
                raise ValueError("output directory not specified") from None
            self.saver = FileSaver(Path(output_dir), self.prefix)
        else:
            raise ValueError(f"unknown output type {output['type']}")

        # The tasks for individual pages are tracked in the following sets
        # (actually dictionaries: {task.path: task})
        # Each task must be in exactly one of these.
        self.done_tasks = {}
        self.redirecting_tasks = {}
        self.inprogress_tasks = {}
        self.failed_tasks = {}
        self.task_queues = {
            TaskStatus.DONE: self.done_tasks,
            TaskStatus.REDIRECTING: self.redirecting_tasks,
            TaskStatus.IN_PROGRESS: self.inprogress_tasks,
            TaskStatus.FAILED: self.failed_tasks,
        }

        try:
            self.add_task(prefix_parsed, reason='site root (homepage)')
            self._add_extra_files()
            self._add_extra_pages(prefix, self.extra_pages)

            # Each hook name maps to a collection of functions, given as
            # callables or as strings naming them.
            self.hooks = {}
            for name, funcs in config.get('hooks', {}).items():
                for func in funcs:
                    if isinstance(func, str):
                        func = import_variable_from_module(func)
                    self.add_hook(name, func)

            for plugin in config.get('plugins', {}):
                if isinstance(plugin, str):
                    plugin = import_variable_from_module(plugin)
                plugin(self.freeze_info)

            self.semaphore = asyncio.Semaphore(MAX_RUNNING_TASKS)
        except BaseException:
            # Clean up on any failure (including KeyboardInterrupt), then
            # let the exception propagate unchanged.
            self.cancel_tasks()
            raise