def get_links_from_css(self, style_text, item):
    """Build download requests for the URLs referenced by a CSS text.

    Every URL that ``cssutils.getUrls`` reports for ``style_text`` is
    resolved against ``item['response'].url``. Each occurrence of the
    original reference inside ``item['content']`` is then rewritten to the
    resolved absolute URL, so that it can later be swapped for a local one.

    Args:
        style_text: CSS source to scan for URL references.
        item: Mapping with a ``'response'`` entry (an object exposing
            ``url``) and a ``'content'`` entry (the text to rewrite).
            ``item['content']`` is reassigned with the rewritten text.

    Returns:
        A list with one ``Request`` per distinct URL reference, in the
        order the references were first seen. Empty references and inline
        ``data:`` URIs produce no request and are left untouched.
    """
    import re
    try:
        from urllib.parse import urljoin
    except ImportError:  # Python 2 fallback; the file's target version is not visible here
        from urlparse import urljoin

    base_url = item['response'].url

    sheet = CSSStyleSheet()
    sheet.cssText = style_text

    content = item['content']

    # Map each distinct reference to its absolute form. De-duplicating
    # matters: rewriting the same reference twice would nest the absolute
    # URL inside itself.
    absolute_by_reference = {}
    references = []
    for url in cssutils.getUrls(sheet):
        if not url or url in absolute_by_reference:
            continue
        if url.lower().startswith('data:'):
            # Inline payload: nothing to download and nothing to rewrite.
            continue
        # urljoin handles root-relative, document-relative, protocol-relative
        # and already-absolute references, and keeps the base URL's scheme.
        absolute_by_reference[url] = urljoin(base_url, url)
        references.append(url)

    if references:
        # Rewrite in a single pass so text that was already substituted is
        # never scanned again. Longest-first alternation makes a reference
        # win over any shorter reference that is a prefix of it.
        longest_first = sorted(references, key=len, reverse=True)
        pattern = re.compile('|'.join(re.escape(ref) for ref in longest_first))
        content = pattern.sub(
            lambda match: absolute_by_reference[match.group(0)], content)

    item['content'] = content
    return [Request(absolute_by_reference[ref]) for ref in references]
def parse_css_stylesheet(content):
    """Parse CSS text and dispatch each resulting rule to ``parse_css_rule``.

    The text is loaded into a fresh ``CSSStyleSheet``. If that assignment
    raises, the raw ``content`` is passed to ``parser.process_content`` with
    ``contexts.CSS_UNKNOWN`` instead. In either case, whatever rules the
    sheet holds afterwards are still iterated.

    The time spent in this function is added to ``library.css_us``.
    """
    from datetime import datetime

    started_at = datetime.now()

    stylesheet = CSSStyleSheet()
    try:
        stylesheet.cssText = content
    except Exception:
        # The CSS could not be parsed; fall back to processing the raw text.
        parser.process_content(content, contexts.CSS_UNKNOWN)

    for css_rule in stylesheet.cssRules:
        parse_css_rule(css_rule)

    finished_at = datetime.now()
    elapsed = finished_at - started_at
    library.css_us += elapsed
def get_media_requests(self, item, info):
    """Return one ``Request`` per URL found in the item's CSS content.

    ``item['content']`` is loaded into a ``CSSStyleSheet`` and every URL
    reported by ``cssutils.getUrls`` is wrapped in a ``Request``. The
    ``info`` argument is accepted but not used here.
    """
    stylesheet = CSSStyleSheet()
    stylesheet.cssText = item['content']
    return [Request(link) for link in cssutils.getUrls(stylesheet)]