def fake_response(link, content, **response_data):
    """A fake response that can be added to the mirror.
    """
    redirects = response_data.pop('redirects', [])

    # Use the fake internet system to generate a response object.
    # This is more reliable than putting one together manually.
    data = {'stream': content}
    data.update(response_data)
    with internet(**{link.original_url: data}):
        session = TestSession()
        session.mount('http://', TestAdapter())
        session.mount('https://', TestAdapter())
        response = session.request('GET', link.original_url)

    # Additional attributes are expected. This is what the spider
    # does before passing a link to mirror.add(). Possibly we should
    # have less code duplication here with the actual spider code.
    parser_class = get_parser_for_mimetype(get_content_type(response))
    if parser_class:
        response.parsed = parser_class(
            response.content, response.url, encoding=response.encoding)
    else:
        response.parsed = None
    response.links_parsed = HeaderLinkParser(response)

    response.redirects = redirects
    return response
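# Usage sketch: how a test might feed such a fake response into the mirror.
# ``Mirror`` and its constructor signature are assumed here for illustration,
# as is the ``headers`` key being understood by the fake internet system;
# only ``fake_response`` above is defined in this module.
def test_add_page(tmpdir):
    link = Link('http://example.org/')
    response = fake_response(
        link, b'<html></html>',
        headers={'content-type': 'text/html'})  # assumed keyword
    mirror = Mirror(str(tmpdir))  # hypothetical constructor signature
    mirror.add(link, response)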
def add(self, link, response):
    """Store the given page.
    """
    # Figure out the filename first
    rel_filename = self.get_filename(link, response)
    # TODO: Make sure the filename is not outside the cache directory,
    # avoid issues with servers injecting special path instructions.
    rel_filename = safe_filename(rel_filename)
    # Do not allow writing inside the data directory, this would
    # possibly allow code injection
    assert not rel_filename.startswith('.track/')

    # Store the file
    with self.open(rel_filename, 'wb') as f:
        f.write(response.content)

    # We also add a copy that will not be affected by any link
    # converting for debugging purposes. It'll allow us to validate
    # via a diff what the conversion is doing.
    if self.backups:
        with self.open(path.join('.backups', rel_filename), 'wb') as f:
            f.write(response.content)

    # Add to database: data about the url
    url_info = {
        'original_url': link.original_url,
        'mimetype': get_content_type(response),
        'etag': response.headers.get('etag'),
        'encoding': response.encoding,
        'last-modified': response.headers.get('last-modified'),
        'expires': parse_http_date_header(response.headers.get('expires')),
        'links': []
    }
    for url, info in itertools.chain(
            response.links_parsed, response.parsed or ()):
        try:
            url_info['links'].append((Link(url).url, info))
        except urlnorm.InvalidUrl:
            pass
    self.url_info[link.url] = url_info

    # The url itself
    self.encountered_urls[link.url] = rel_filename
    self.stored_urls.setdefault(link.url, set())
    self.stored_urls[link.url] |= {rel_filename}

    # Be sure to update the reverse cache
    self._insert_into_url_usage(link.url, url_info['links'])

    # Make sure database is saved
    self.flush()

    # See if we should apply modifications now (as opposed to waiting
    # until the last response has been added).
    if self.write_at_once:
        self._convert_links(link.url)
        self._create_index()
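# For illustration only: ``safe_filename`` is defined elsewhere in the
# codebase and is expected to neutralize path traversal before the assert
# above runs. A minimal sketch of that idea, assuming simple segment
# filtering, might look like this:
def safe_filename_sketch(filename):
    # Drop empty, '.' and '..' segments so the result cannot point
    # outside the cache directory.
    parts = [p for p in filename.split(path.sep)
             if p not in ('', '.', '..')]
    return path.sep.join(parts)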
def content_type(link, ctx):
    """Match against the content type of the url.

    A content type might be ``text/html`` or ``image/png``.

    Note: This will execute a HEAD request to the url to determine
    the content type.
    """
    response = link.resolve(ctx['spider'], 'head')
    if not response:
        return None
    return get_content_type(response)
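# Usage sketch: a filter built on top of ``content_type``. The
# ``(link, ctx)`` calling convention is taken from the signature above;
# how such a predicate is wired into the spider's rules is assumed here.
def is_html(link, ctx):
    # Note this triggers the HEAD request mentioned in the docstring.
    return content_type(link, ctx) == 'text/html'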
def get_filename(self, link, response):
    """Determine the filename under which to store a URL.

    This is designed for subclasses to be able to provide a custom
    implementation. They do not need to care about making the
    filename safe for the filesystem.
    """
    # Use the final response url (after any redirects)
    url = response.url
    parsed = urlparse(url)

    # Prefix the domain to the filename
    filename = path.join(parsed.netloc, parsed.path[1:])

    # If we are dealing with a trailing slash, create an index file
    # in a directory - extension added later.
    if filename.endswith(path.sep):
        filename = path.join(filename, 'index')

    # Choose the right file extension
    mime = get_content_type(response)
    extensions = mimetypes.guess_all_extensions(mime, strict=False)

    # Set our extension if the existing one does not match the mime type
    name_wo_ext, current_extension = path.splitext(filename)
    if not current_extension or current_extension not in extensions:
        extensions.sort(reverse=True)
        if extensions:
            filename = '{0}{1}'.format(name_wo_ext, extensions[0])

    # If there is a query string, insert it before the extension
    if parsed.query:
        base, ext = path.splitext(filename)
        query_hash = hashlib.md5(
            parsed.query.encode()).hexdigest().lower()[:10]
        filename = '{}_{}{}'.format(base, query_hash, ext)

    return filename
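# A quick, runnable illustration of the query-hash suffix used above:
# the first ten hex characters of the md5 of the raw query string are
# inserted before the extension. The query value here is just an example.
if __name__ == '__main__':
    import hashlib
    base, ext = 'example.org/p', '.html'
    query_hash = hashlib.md5('page=2'.encode()).hexdigest().lower()[:10]
    print('{}_{}{}'.format(base, query_hash, ext))
    # -> example.org/p_<10 hex chars>.html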