Example #1
def fake_response(link, content, **response_data):
    """A fake response that can be added to the mirror.
    """
    redirects = response_data.pop('redirects', [])
    # Use the fake internet system to generate a response object.
    # This is more reliable than putting one together manually.
    data = {'stream': content}
    data.update(response_data)
    with internet(**{link.original_url: data}):
        session = TestSession()
        session.mount('http://', TestAdapter())
        session.mount('https://', TestAdapter())
        response = session.request('GET', link.original_url)

    # The mirror expects some additional attributes on the response;
    # this replicates what the spider does before passing a link to
    # mirror.add(). Possibly we should share this code with the actual
    # spider rather than duplicating it here.
    parser_class = get_parser_for_mimetype(get_content_type(response))
    if parser_class:
        response.parsed = parser_class(response.content,
                                       response.url,
                                       encoding=response.encoding)
    else:
        response.parsed = None
    response.links_parsed = HeaderLinkParser(response)
    response.redirects = redirects
    return response
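
Every example on this page calls get_content_type(response). As a point of reference, here is a minimal sketch of what such a helper plausibly does (an assumption for illustration, not the project's actual implementation): return only the mimetype portion of the Content-Type header.

def get_content_type(response):
    # Assumed behavior: strip parameters such as "; charset=utf-8"
    # from the Content-Type header and normalize the case.
    content_type = response.headers.get('content-type', '')
    return content_type.split(';', 1)[0].strip().lower()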
Example #2
    def add(self, link, response):
        """Store the given page.
        """
        # Figure out the filename first
        rel_filename = self.get_filename(link, response)
        # TODO: Make sure the filename is not outside the cache directory,
        # avoid issues with servers injecting special path instructions.
        rel_filename = safe_filename(rel_filename)
        # Do not allow writing inside the data directory; this could
        # enable code injection.
        assert not rel_filename.startswith('.track/')

        # Store the file
        with self.open(rel_filename, 'wb') as f:
            f.write(response.content)

        # For debugging purposes, we also keep a copy that will not be
        # affected by any link conversion. It allows us to validate via
        # a diff what the conversion is doing.
        if self.backups:
            with self.open(path.join('.backups', rel_filename), 'wb') as f:
                f.write(response.content)

        # Add to database: data about the url
        url_info = {
            'original_url': link.original_url,
            'mimetype': get_content_type(response),
            'etag': response.headers.get('etag'),
            'encoding': response.encoding,
            'last-modified': response.headers.get('last-modified'),
            'expires': parse_http_date_header(response.headers.get('expires')),
            'links': []
        }
        for url, info in itertools.chain(
                response.links_parsed,
                response.parsed or ()):
            try:
                url_info['links'].append((Link(url).url, info))
            except urlnorm.InvalidUrl:
                pass
        self.url_info[link.url] = url_info
        # The url itself
        self.encountered_urls[link.url] = rel_filename
        self.stored_urls.setdefault(link.url, set())
        self.stored_urls[link.url] |= {rel_filename}
        # Be sure to update the reverse cache
        self._insert_into_url_usage(link.url, url_info['links'])
        # Make sure database is saved
        self.flush()

        # See if we should apply modifications now (as opposed to waiting
        # until the last response has been added).
        if self.write_at_once:
            self._convert_links(link.url)
            self._create_index()
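
The add() method above relies on a parse_http_date_header() helper to interpret the Expires header. A hedged sketch of such a helper (assumed behavior, not the project's actual code), built on the standard library's parser for RFC 7231 dates:

from email.utils import parsedate_to_datetime

def parse_http_date_header(value):
    # The Expires header may be absent or malformed; treat both as None.
    if not value:
        return None
    try:
        return parsedate_to_datetime(value)
    except (TypeError, ValueError):
        return None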
Example #3
    def content_type(link, ctx):
        """Match against the content type of the url.

        A content type might be ``text/html`` or ``image/png``.

        Note: This will execute a HEAD request to the url to determine
        the content type.
        """
        response = link.resolve(ctx['spider'], 'head')
        if not response:
            return None
        return get_content_type(response)
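
A hypothetical call site for this rule function, for illustration only: link and ctx are supplied by the spider's rule machinery, and html_only is an invented name, not part of the project.

def html_only(link, ctx):
    # Follow a link only if a HEAD request reports it as an HTML page.
    return content_type(link, ctx) == 'text/html'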
Example #4
    def get_filename(self, link, response):
        """Determine the filename under which to store a URL.

        This is designed for subclasses to be able to provide a custom
        implementation. They do not need to care about making the
        filename safe for the filesystem.
        """
        # Use the response's final URL (after any redirects) rather
        # than the link that was passed in.
        link = response.url

        parsed = urlparse(link)

        # Prefix the domain to the filename
        filename = path.join(parsed.netloc, parsed.path[1:])

        # If the path ends with a trailing slash, create an index file
        # inside that directory - the extension is added later.
        if filename.endswith(path.sep):
            filename = path.join(filename, 'index')

        # Choose the right file extension
        mime = get_content_type(response)
        extensions = mimetypes.guess_all_extensions(mime, strict=False)
        # Set our extension if the existing one does not match the mime type
        name_wo_ext, current_extension = path.splitext(filename)
        if not current_extension or current_extension not in extensions:
            extensions.sort(reverse=True)
            if extensions:
                filename = '{0}{1}'.format(name_wo_ext, extensions[0])

        # If there is a query string, insert it before the extension
        if parsed.query:
            base, ext = path.splitext(filename)
            query_hash = hashlib.md5(
                parsed.query.encode()).hexdigest().lower()[:10]
            filename = '{}_{}{}'.format(base, query_hash, ext)

        return filename
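
To make the mapping concrete, here is roughly what the method produces for a few URLs. These values are illustrative assumptions, not taken from the project's tests; the chosen extension comes from mimetypes.guess_all_extensions() and can vary between systems.

#   http://example.org/           -> example.org/index.html
#   http://example.org/style.css  -> example.org/style.css
#   http://example.org/page?id=3  -> example.org/page_<md5-10>.html
#
# The query-string suffix is the first ten hex characters of the
# query's MD5, as computed in the method above:
import hashlib

def query_suffix(query):
    return hashlib.md5(query.encode()).hexdigest().lower()[:10]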