def process_response(self, request, response, spider):
    """Rewrite the ``href`` of every ``<a>`` element to an absolute URL.

    A response whose type is exactly ``Response`` and whose URL does not
    match ``_file_pattern`` is first rebuilt as an ``HtmlResponse`` from
    its URL and body.  If the resulting response exposes
    ``body_as_unicode``, its markup is parsed, each ``<a href=...>`` is
    joined against the response's base URL, and a copy of the response
    carrying the re-serialised markup is returned.

    :param request: the request that produced ``response`` (unused here).
    :param response: the response to process.
    :param spider: the spider the response is for (unused here).
    :returns: a replaced response with absolute anchor links, or the
        response unchanged when it has no ``body_as_unicode`` or its
        text is empty.
    """
    # Exact type comparison on purpose: only a plain ``Response`` is
    # promoted; instances of subclasses are passed through as they are.
    if type(response) is Response and not _file_pattern.match(response.url):
        response = HtmlResponse(response.url, body=response.body)

    if not hasattr(response, 'body_as_unicode'):
        return response

    text = response.body_as_unicode()
    if not text.strip():
        # Nothing to rewrite.  NOTE(review): ``html`` is presumably
        # ``lxml.html``, whose parser rejects an empty document -- confirm.
        return response

    hdoc = html.fromstring(text)
    # The base URL does not depend on the link, so resolve it only once.
    base_url = get_base_url(response)
    # Select only anchors that carry an href: joining a missing (None)
    # href would yield the base URL and turn named anchors into links.
    for link in hdoc.xpath('//a[@href]'):
        link.set('href', urlparse.urljoin(base_url, link.get('href')))
    return response.replace(body=html.tostring(hdoc, encoding='unicode'))