def run(self):
    # Step 0: ensure that the document_root and base_path variables are
    # set. If the file that's being processed was inside a source that has
    # either one or both not set, then this processor can't run.
    if self.document_root is None or self.base_path is None:
        raise DocumentRootAndBasePathRequiredException

    # We don't rename the file, so we can use the default output file.
    parser = CSSParser(log=None, loglevel=logging.CRITICAL)
    sheet = parser.parseFile(self.input_file)

    # Step 1: ensure the file has URLs. If it doesn't, we can stop the
    # processing.
    url_count = 0
    for url in getUrls(sheet):
        url_count += 1
        break
    if url_count == 0:
        return self.input_file

    # Step 2: resolve the relative URLs to absolute paths.
    replaceUrls(sheet, self.resolveToAbsolutePath)

    # Step 3: verify that each of these files has been synced.
    synced_files_db = urljoin(sys.path[0] + os.sep, SYNCED_FILES_DB)
    self.dbcon = sqlite3.connect(synced_files_db)
    # This is the default, but we set it explicitly, just to be sure.
    self.dbcon.text_factory = unicode
    self.dbcur = self.dbcon.cursor()
    all_synced = True
    for urlstring in getUrls(sheet):
        # Skip absolute URLs.
        if urlstring.startswith("http://") or urlstring.startswith("https://"):
            continue

        # Skip broken references in the CSS file. This would otherwise
        # prevent this CSS file from ever passing through this processor.
        if not os.path.exists(urlstring):
            continue

        # Get the CDN URL for the given absolute path.
        self.dbcur.execute("SELECT url FROM synced_files WHERE input_file=?", (urlstring,))
        result = self.dbcur.fetchone()
        if result == None:
            raise RequestToRequeueException(
                "The file '%s' has not yet been synced to the server '%s'"
                % (urlstring, self.process_for_server)
            )
        else:
            cdn_url = result[0]

    # Step 4: resolve the absolute paths to CDN URLs.
    replaceUrls(sheet, self.resolveToCDNURL)

    # Step 5: write the updated CSS to the output file.
    f = open(self.output_file, "w")
    f.write(sheet.cssText)
    f.close()
    return self.output_file

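# The two resolver callbacks used above (resolveToAbsolutePath and
# resolveToCDNURL) are not part of this snippet. As a rough illustration of the
# contract cssutils.replaceUrls() expects -- a callable that receives each
# url() token and returns its replacement -- here is a minimal, hypothetical
# sketch; the resolver name and the CDN prefix are assumptions for the example,
# not the project's actual implementation.
import cssutils

def _example_resolver(url):
    # Leave absolute and data: URLs untouched; prefix everything else.
    if url.startswith(("http://", "https://", "data:")):
        return url
    return "https://cdn.example.com/" + url.lstrip("./")

_sheet = cssutils.parseString("body { background: url(img/bg.png); }")
cssutils.replaceUrls(_sheet, _example_resolver)
# _sheet.cssText now references https://cdn.example.com/img/bg.png
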
def clone_all_css_resources(css, current_directory, root_url, root_directory):
    # Clone all resources associated with each url
    css_sheet = cssutils.parseString(css)
    resource_urls = cssutils.getUrls(css_sheet)
    for url in resource_urls:
        if url not in resource_url_list:
            resource_url_list.append(url)
            # Create any required new directories for this url
            url = urlparse.urljoin(root_url, url)
            css_directory = check_create_directory(url, current_directory, root_directory)
            file_name = urlparse.urlparse(url).path.split('/')[-1]
            # Save this file to the directory
            try:
                output_file_directory = os.path.join(css_directory, file_name)
                urllib.urlretrieve(url, output_file_directory)
            except:
                print 'io error'
                print url
                print output_file_directory
                #raise Exception("UrlLib Error: writing file %s" % os.path.join(css_directory, file_name))
                sys.exit(0)
            print("%s cloned..." % url)

def get_links_from_css(self, style_text, item):
    '''
    Extract urls from css style text and return requests for downloading
    these images. This function also rewrites the urls to absolute URIs so
    that they can later be replaced by local urls.
    '''
    response = item['response']
    sheet = CSSStyleSheet()
    sheet.cssText = style_text
    urls = cssutils.getUrls(sheet)
    requests = []
    item_content = item['content']
    for url in urls:
        request_url = response.url.replace('http://', '')
        if url[0] == '/':
            request_url = request_url.split('/')[0] + url
        else:
            request_url = request_url.split('/')
            request_url[-1] = url
            request_url = '/'.join(request_url)
        request_url = 'http://%s' % request_url
        item_content = item_content.replace(url, request_url)
        requests.append(Request(request_url))
    item['content'] = item_content
    return requests

def process(self):
    if 'fonts' in self.link:
        # Omit google fonts
        self.tag.decompose()
        return

    # Parse urls in css (using parseString because it is much faster than parseUrl)
    style_sheet = downloader.read(self.link).decode('utf-8-sig', errors='ignore')
    sheet = cssutils.parseString(style_sheet)
    for css_url in cssutils.getUrls(sheet):
        if not css_url.startswith('data:image') and not css_url.startswith('data:application'):
            try:
                style_sheet = style_sheet.replace(
                    css_url,
                    os.path.basename(self.write_url(css_url, url=self.link, default_ext='.png')))
            except BROKEN_EXCEPTIONS as e:
                LOGGER.warn(
                    'Unable to download stylesheet url at {} ({})'.format(self.url, str(e)))

    self.tag[self.attribute] = self.format_url(
        self.write_contents(self.get_filename(self.link), style_sheet))
    return self.tag[self.attribute]

def iterlinks(self, name, get_line_numbers=True):
    ''' Iterate over all links in name. If get_line_numbers is True then it
    yields results of the form (link, line_number, offset), where
    line_number is the line number at which the link occurs and offset is
    the number of characters from the start of the line. Note that offset
    could actually encompass several lines if not zero. '''
    media_type = self.mime_map.get(name, guess_type(name))
    if name == self.opf_name:
        for elem in self.opf_xpath('//*[@href]'):
            yield (elem.get('href'), elem.sourceline, 0) if get_line_numbers else elem.get('href')
    elif media_type.lower() in OEB_DOCS:
        for el, attr, link, pos in iterlinks(self.parsed(name)):
            yield (link, el.sourceline, pos) if get_line_numbers else link
    elif media_type.lower() in OEB_STYLES:
        if get_line_numbers:
            with self.open(name, 'rb') as f:
                raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
                position = PositionFinder(raw)
                is_in_comment = CommentFinder(raw)
                for link, offset in itercsslinks(raw):
                    if not is_in_comment(offset):
                        lnum, col = position(offset)
                        yield link, lnum, col
        else:
            for link in getUrls(self.parsed(name)):
                yield link
    elif media_type.lower() == guess_type('toc.ncx'):
        for elem in self.parsed(name).xpath('//*[@src]'):
            yield (elem.get('src'), elem.sourceline, 0) if get_line_numbers else elem.get('src')

def geturls(stylesheet):
    """
    Return a list of all URLs appearing in the :class:`CSSStyleSheet`
    :obj:`stylesheet`.
    """
    # This requires cssutils 0.9.5b1
    return [url.URL(u) for u in cssutils.getUrls(stylesheet)]

def parse(self, response):
    items = ScraphubItem()
    tempfolder = tempfile.TemporaryDirectory(dir=self.service_config['temp_dir'])
    for deal_ in response.css('div.panel-body'):
        metadata = {}
        metadata['source'] = self.start_urls
        url = None
        image_name = None
        ccssrc_ = deal_.css('div.deal-image').get()
        if ccssrc_ is not None:
            soup = BeautifulSoup(ccssrc_)
            div_style = soup.find('div')['style']
            sheet = cssutils.css.CSSStyleSheet()
            sheet.add("dummy_selector { %s }" % div_style)
            url = list(cssutils.getUrls(sheet))[0]
            image_name = os.path.basename(urlparse(url).path)
            opener = urllib.request.URLopener()
            opener.addheader('User-Agent', 'Mozilla/5.0')
            filename, headers = opener.retrieve(url, '%s/' % (tempfolder.name) + image_name)
            BucketUploader.upload_blob('%s/' % (tempfolder.name) + image_name,
                                       self.service_config['bucket_PATH'] + str(image_name))
        items['description'] = deal_.css('a::text').get()
        items['discount'] = deal_.css('div.deal-discount::text').get()
        items['imgurl'] = self.service_config['bucket_baseURL'] + str(image_name)
        items['base_amount'] = deal_.css('small::text').get()  # recorded as a string; this needs to be converted to an integer
        items['source_href'] = GlobalVariable.urlRegx(deal_.css('a').get())  # This needs to be handled properly
        items['fetched_date'] = datetime.datetime.today()
        # print(deal_.css('div.col-sm-3').get())
        # metadata['expire'] = deal_.css('span.expirydate::text').get()
        items['meta'] = json.dumps(metadata)
        items['is_automated'] = True
        items['postid'] = str(uuid.uuid1())
        items['expire_date'] = datetime.datetime.today()
        # items['current_amount'] = deal_.css('span::text').get()
        yield items
    tempfolder.cleanup()

def process(self, files):
    for f in files:
        if not f['resource_path'].endswith('.css') or f['type'] != 'file':
            yield f
            continue
        self._counter += 1
        fs_rpath = f['filesystem_path']
        sheet = cssutils.parseFile(fs_rpath)
        sheet.setSerializer(self.serializer)
        for url in cssutils.getUrls(sheet):
            u = urlparse(url)
            if u.scheme or u.netloc or not u.path.startswith('./'):
                logging.warning('non-relative URL used in CSS: %s' % url)
        if self.resolve_imports:
            sheet = cssutils.resolveImports(sheet)
        target = os.path.join(
            self._tmpdir, str(self._counter) + '-' + os.path.basename(fs_rpath))
        out_f = open(target, 'wb')
        try:
            out_f.write(sheet.cssText)
        finally:
            out_f.close()
        f['filesystem_path'] = target
        yield f

def test_getUrls(self):
    "cssutils.getUrls()"
    cssutils.ser.prefs.keepAllProperties = True

    css = '''
    @import "im1";
    @import url(im2);
    @import url( im3 );
    @import url( "im4" );
    @import url( 'im5' );
    a {
        background-image: url(a) !important;
        background-\image: url(b);
        background: url(c) no-repeat !important;
        /* issue #46 */
        src: local("xx"),
             url("f.woff") format("woff"),
             url("f.otf") format("opentype"),
             url("f.svg#f") format("svg");
    }'''
    urls = set(cssutils.getUrls(cssutils.parseString(css)))
    self.assertEqual(urls, set([
        "im1", "im2", "im3", "im4", "im5",
        "a", "b", "c",
        u'f.woff', u'f.svg#f', u'f.otf']))

    cssutils.ser.prefs.keepAllProperties = False

def _get_css_imports_cssutils(data, inline=False):
    """Return all assets that are referenced in the given CSS document.

    The returned URLs are relative to the stylesheet's URL.

    Args:
        data: The content of the stylesheet to scan as string.
        inline: True if the argument is an inline HTML style attribute.
    """
    try:
        import cssutils
    except (ImportError, re.error):
        # Catching re.error because cssutils in earlier releases (<= 1.0) is
        # broken on Python 3.5
        # See https://bitbucket.org/cthedot/cssutils/issues/52
        return None

    # We don't care about invalid CSS data, this will only litter the log
    # output with CSS errors
    parser = cssutils.CSSParser(loglevel=100,
                                fetcher=lambda url: (None, ""), validate=False)
    if not inline:
        sheet = parser.parseString(data)
        return list(cssutils.getUrls(sheet))
    else:
        urls = []
        declaration = parser.parseStyle(data)
        # prop = background, color, margin, ...
        for prop in declaration:
            # value = red, 10px, url(foobar), ...
            for value in prop.propertyValue:
                if isinstance(value, cssutils.css.URIValue):
                    if value.uri:
                        urls.append(value.uri)
        return urls

def iterlinks(self, name, get_line_numbers=True):
    """ Iterate over all links in name. If get_line_numbers is True then it
    yields results of the form (link, line_number, offset), where
    line_number is the line number at which the link occurs and offset is
    the number of characters from the start of the line. Note that offset
    could actually encompass several lines if not zero. """
    media_type = self.mime_map.get(name, guess_type(name))
    if name == self.opf_name:
        for elem in self.opf_xpath("//*[@href]"):
            yield (elem.get("href"), elem.sourceline, 0) if get_line_numbers else elem.get("href")
    elif media_type.lower() in OEB_DOCS:
        for el, attr, link, pos in iterlinks(self.parsed(name)):
            yield (link, el.sourceline, pos) if get_line_numbers else link
    elif media_type.lower() in OEB_STYLES:
        if get_line_numbers:
            with self.open(name) as f:
                raw = self.decode(f.read())
                for link, offset in itercsslinks(raw):
                    yield link, 0, offset
        else:
            for link in getUrls(self.parsed(name)):
                yield link
    elif media_type.lower() == guess_type("toc.ncx"):
        for elem in self.parsed(name).xpath("//*[@src]"):
            yield (elem.get("src"), elem.sourceline, 0) if get_line_numbers else elem.get("src")

def _get_css_imports_cssutils(data, inline=False):
    """Return all assets that are referenced in the given CSS document.

    The returned URLs are relative to the stylesheet's URL.

    Args:
        data: The content of the stylesheet to scan as string.
        inline: True if the argument is an inline HTML style attribute.
    """
    # We don't care about invalid CSS data, this will only litter the log
    # output with CSS errors
    parser = cssutils.CSSParser(loglevel=100,
                                fetcher=lambda url: (None, ""), validate=False)
    if not inline:
        sheet = parser.parseString(data)
        return list(cssutils.getUrls(sheet))
    else:
        urls = []
        declaration = parser.parseStyle(data)
        # prop = background, color, margin, ...
        for prop in declaration:
            # value = red, 10px, url(foobar), ...
            for value in prop.propertyValue:
                if isinstance(value, cssutils.css.URIValue):
                    if value.uri:
                        urls.append(value.uri)
        return urls

def scrape_style(url, zipper):
    """ Scrape any instances of url(...)

        Args:
            url (str): url to css file
            zipper (html_writer): zip to write to
        Returns str of css style rules
    """
    sheet = cssutils.parseUrl(url)
    rules = sheet.cssText.decode('utf-8')

    # Parse urls in css
    for url in cssutils.getUrls(sheet):
        try:
            # Download any urls in css to the shared asset directory (if not already there)
            filename = url.split('?')[0].split('/')[-1]
            filepath = os.path.sep.join([SHARED_ASSET_DIRECTORY, filename])
            if not os.path.isfile(filepath):
                with open(filepath, 'wb') as fobj:
                    fobj.write(read(url))

            # Replace text with new url
            new_url = zipper.write_file(filepath, filename, directory="assets")
            rules = rules.replace(url, "../" + new_url)
        except requests.exceptions.HTTPError:
            LOGGER.warning("Could not download css url {}".format(url))

    return rules

def acquire_css_files(html, soup, webpage_cursor, local_file_path, root_directory):
    # Find all css file locations
    for css_link in soup.find_all('link'):
        # Convert relative link to css file to an absolute link
        href = urlparse.urljoin(webpage_cursor, css_link['href'])
        if href not in css_url_list:
            css_url_list.append(href)
            current_directory = check_create_directory(href, local_file_path, root_directory)
            file_name = urlparse.urlparse(href).path.split('/')[-1]
            # Save this file to the directory
            request = urllib2.Request(href)
            try:
                responce = urllib2.urlopen(request)
                css = responce.read()
            except urllib2.URLError, e:
                raise Exception("%s returned an error: %s" % (href, e))
                sys.exit(0)
            modified_css_sheet = cssutils.parseString(css)
            resource_urls = set(cssutils.getUrls(modified_css_sheet))
            modified_css_text = css
            print 'href_test: ', href
            file_depth = href.count('/') - 3
            depth_relative_link_slashes = '../' * file_depth
            for url in resource_urls:
                if url.startswith('/'):
                    modified_url = depth_relative_link_slashes + url[1:]
                    modified_css_text = modified_css_text.replace(url, modified_url)
                    print url
                    print 'modified_url: ', modified_url
            # Iterate over all internal resources on each css file
            try:
                if file_name == '':
                    file_loc = os.path.join(current_directory, 'application.css')
                elif file_name.endswith('.css'):
                    file_loc = os.path.join(current_directory, file_name)
                else:
                    file_loc = os.path.join(current_directory, (file_name + '.css'))
                css_file = open(file_loc, 'w')
                css_file.write(modified_css_text)
            except IOError as e:
                print 'IO Write Error: %s' % e
                sys.exit(0)
            finally:
                css_file.close()
            print("%s cloned..." % href)
            # Clone all associated resources with this css file
            clone_all_css_resources(css, current_directory, webpage_cursor, root_directory)

def get_links_from_css(css_file, base_url):
    """Get all links from a CSS file."""
    result = []
    text = css_file.read()
    parsed = cssutils.parseString(text)
    all_urls = cssutils.getUrls(parsed)
    for url in all_urls:
        result.append(urljoin(base_url, url))
    return result

def save_css_assets(self, path):
    project_path = self.get_project_path()
    file_name = path.replace(project_path, '')
    try:
        css = cssutils.parseFile(path)
        urls = cssutils.getUrls(css)
    except:
        self.error_files.append(file_name)
        return
    file_path = file_name.rsplit('/', 1)[0]
    for url in urls:
        if 'http' not in url and 'https' not in url:
            url = url.rsplit('/', 1)
            if len(url) == 1:
                asset = '/' + url[0]
                path = ''
            elif len(url) > 1:
                asset = '/' + url[1]
                path = '/' + url[0]
            else:
                continue
            if "../" in path:
                path_a = path.split("../")
                if path_a[-1] != '':
                    sub_path = file_path.split('/')
                    for i in range(len(path_a) - 1):
                        sub_path = sub_path[:-1]
                    path = '/' + path_a[-1]
                    sub_path = '/'.join(sub_path)
            else:
                sub_path = file_path
            if sub_path.startswith('/'):
                sub_path = sub_path[1:]
            l = self.get_requestable_link(sub_path + path + asset)
            if l in self.visited_assets:
                continue
            r = self.do_request(l, stream=True, type=self.asset_request)
            if r.status_code == 200:
                file = asset.split('?')[0]
                full_path = self.get_dir(sub_path + path, True)
                if file.endswith('.css'):
                    with open(full_path + file, "wb") as f:
                        f.write(r.text.encode('utf-8'))
                        f.close()
                    self.save_css_assets(full_path + file)
                else:
                    with open(full_path + file, "wb") as f:
                        shutil.copyfileobj(r.raw, f)

def reviews(self, app_id, page=1):
    """Sends a POST request and retrieves a list of reviews for the specified app.

    :param app_id: the app to retrieve details from, e.g. 'com.nintendo.zaaa'
    :param page: the page number to retrieve; max is 10
    :return: a list of reviews
    """
    data = {
        'reviewType': 0,
        'pageNum': page,
        'id': app_id,
        'reviewSortOrder': 4,
        'xhr': 1,
        'hl': self.language
    }
    self.params['authuser'] = '******'

    response = send_request('POST', s.REVIEW_URL, data, self.params)
    content = response.text
    content = content[content.find('[["ecr"'):].strip()
    data = json.loads(content)
    html = data[0][2]
    soup = BeautifulSoup(html, 'lxml', from_encoding='utf8')

    reviews = []
    for element in soup.select('.single-review'):
        review = {}
        avatar_style = element.select_one('.author-image').get('style')
        if avatar_style:
            sheet = cssutils.css.CSSStyleSheet()
            sheet.add('tmp { %s }' % avatar_style)
            review['author_image'] = list(cssutils.getUrls(sheet))[0]

        review_header = element.select_one('.review-header')
        review['review_id'] = review_header.get('data-reviewid', '')
        review['review_permalink'] = review_header.select_one('.reviews-permalink').get('href')
        review['author_name'] = review_header.select_one('.author-name').text
        review['review_date'] = review_header.select_one('.review-date').text

        curr_rating = review_header.select_one('.current-rating').get('style')
        review['current_rating'] = int(
            int(str(cssutils.parseStyle(curr_rating).width).replace('%', '')) / 20)

        body_elem = element.select_one('.review-body')
        review_title = body_elem.select_one('.review-title').extract()
        body_elem.select_one('.review-link').decompose()
        review['review_title'] = review_title.text
        review['review_body'] = body_elem.text

        reviews.append(review)

    return reviews

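# Several snippets in this collection wrap an inline style attribute in a
# throwaway "dummy_selector { ... }" stylesheet just to run cssutils.getUrls()
# on it. A lighter-weight alternative is to parse the declaration directly with
# cssutils.parseStyle() and pull out URIValue tokens, mirroring the walk used
# in _get_css_imports_cssutils() above. This is only an illustrative sketch;
# the helper name is made up and is not part of the scraper above.
import cssutils

def urls_from_inline_style(style_text):
    declaration = cssutils.parseStyle(style_text)
    return [value.uri
            for prop in declaration
            for value in prop.propertyValue
            if isinstance(value, cssutils.css.URIValue) and value.uri]

# urls_from_inline_style('background-image: url(avatar.png)')
# -> ['avatar.png']
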
def get_body(self):
    while not self.new_urls.empty():
        current_url = yield from self.new_urls.get()
        if current_url in self.visited_urls:
            continue
        self.visited_urls.append(current_url)
        if current_url.name:
            file_name = current_url.name
        elif current_url.raw_path != '/':
            file_name = current_url.path.rsplit('/')[1]
        else:
            file_name = 'index.html'
        file_path = os.path.dirname(current_url.path)
        if file_path == '/':
            file_path = self.target_path
        else:
            file_path = os.path.join(self.target_path, file_path[1:])
        print('path: ', file_path, 'name: ', file_name)
        if file_path and not os.path.exists(file_path):
            os.makedirs(file_path)
        data = None
        try:
            with aiohttp.Timeout(10.0):
                with aiohttp.ClientSession() as session:
                    response = yield from session.get(current_url)
                    data = yield from response.read()
        except aiohttp.ClientError as client_error:
            print(client_error)
        else:
            response.release()
            session.close()
        if data is not None:
            if re.match(re.compile('.*\.(html|php)'), file_name):
                soup = yield from self.replace_links(data)
                data = str(soup).encode()
            with open(os.path.join(file_path, file_name), 'wb') as index_fh:
                index_fh.write(data)
            if '.css' in file_name:
                css = cssutils.parseString(data)
                for carved_url in cssutils.getUrls(css):
                    if carved_url.startswith('data'):
                        continue
                    carved_url = yarl.URL(carved_url)
                    if not carved_url.is_absolute():
                        carved_url = self.root.join(carved_url)
                    if carved_url not in self.visited_urls:
                        yield from self.new_urls.put(carved_url)

def __call__(self, oeb, context):
    import cssutils
    oeb.logger.info('Trimming unused files from manifest...')
    self.opts = context
    used = set()
    for term in oeb.metadata:
        for item in oeb.metadata[term]:
            if item.value in oeb.manifest.hrefs:
                used.add(oeb.manifest.hrefs[item.value])
            elif item.value in oeb.manifest.ids:
                used.add(oeb.manifest.ids[item.value])
    for ref in oeb.guide.values():
        path, _ = urldefrag(ref.href)
        if path in oeb.manifest.hrefs:
            used.add(oeb.manifest.hrefs[path])
    # TOC items are required to be in the spine
    for item in oeb.spine:
        used.add(item)
    unchecked = used
    while unchecked:
        new = set()
        for item in unchecked:
            if (item.media_type in OEB_DOCS or
                    item.media_type[-4:] in ('/xml', '+xml')) and \
                    item.data is not None:
                hrefs = [r[2] for r in iterlinks(item.data)]
                for href in hrefs:
                    if isinstance(href, bytes):
                        href = href.decode('utf-8')
                    try:
                        href = item.abshref(urlnormalize(href))
                    except:
                        continue
                    if href in oeb.manifest.hrefs:
                        found = oeb.manifest.hrefs[href]
                        if found not in used:
                            new.add(found)
            elif item.media_type == CSS_MIME:
                for href in cssutils.getUrls(item.data):
                    href = item.abshref(urlnormalize(href))
                    if href in oeb.manifest.hrefs:
                        found = oeb.manifest.hrefs[href]
                        if found not in used:
                            new.add(found)
        used.update(new)
        unchecked = new
    for item in oeb.manifest.values():
        if item not in used:
            oeb.logger.info('Trimming %r from manifest' % item.href)
            oeb.manifest.remove(item)

def process_resource(self, content, url):
    try:
        sheet = cssutils.parseString(content, href=url)
    except Exception:
        log.warn("Error parsing %s", url)
        return
    linked_fetchables = []
    for img_url in cssutils.getUrls(sheet):
        # Ignore data: uris
        if not img_url.startswith("data:"):
            linked_fetchables.append(Image(urlparse.urljoin(url, img_url), self.sfh))
    return linked_fetchables

async def get_body(self, session):
    while not self.new_urls.empty():
        print(animation[self.itr % len(animation)], end="\r")
        self.itr = self.itr + 1
        current_url, level = await self.new_urls.get()
        if current_url.human_repr() in self.visited_urls:
            continue
        self.visited_urls.append(current_url.human_repr())
        file_name, hash_name = self._make_filename(current_url)
        self.logger.debug('Cloned file: %s', file_name)
        data = None
        content_type = None
        try:
            response = await session.get(current_url, headers={'Accept': 'text/html'}, timeout=10.0)
            headers = self.get_headers(response)
            content_type = response.content_type
            data = await response.read()
        except (aiohttp.ClientError, asyncio.TimeoutError) as client_error:
            self.logger.error(client_error)
        else:
            await response.release()
        if data is not None:
            self.meta[file_name]['hash'] = hash_name
            self.meta[file_name]['headers'] = headers
            if response.status in [401, 403]:
                self.setting["auth_list"].append(file_name)
            self.counter = self.counter + 1
            if content_type == 'text/html':
                soup = await self.replace_links(data, level)
                data = str(soup).encode()
            elif content_type == 'text/css':
                css = cssutils.parseString(data, validate=self.css_validate)
                for carved_url in cssutils.getUrls(css):
                    if carved_url.startswith('data'):
                        continue
                    carved_url = yarl.URL(carved_url)
                    if not carved_url.is_absolute():
                        carved_url = self.root.join(carved_url)
                    if carved_url.human_repr() not in self.visited_urls:
                        await self.new_urls.put((carved_url, level + 1))
            with open(os.path.join(self.target_path, hash_name), 'wb') as index_fh:
                index_fh.write(data)

async def crawler(client, url_queue, archive):
    while True:
        url = await url_queue.get()
        try:
            log.debug(url)
            headers = ACCEPT_HEADERS
            headers['Referer'] = archive['top']
            response = await client.get(url, headers=headers)
            if response.status != 200:
                log.warn('BAD RESPONSE: {}: {}'.format(response.status, url))
            else:
                data = await response.read()
                content_type, params = parse_header(response.headers['content-type'])
                item = {
                    "WebResourceData": data,
                    "WebResourceMIMEType": content_type,
                    "WebResourceURL": url
                }
                if 'charset' in params:
                    item['WebResourceTextEncodingName'] = params['charset']
                # TODO: attempt to reproduce the way HTTP headers are stored (NSKeyedArchiver?)
                archive['items'].append(item)
                archive['seen'][url] = True
                if 'text/html' == content_type:
                    dom = html.fromstring(data)
                    patterns = ['//img/@src', '//img/@data-src',
                                '//img/@data-src-retina', '//script/@src',
                                "//link[@rel='stylesheet']/@href"]
                    for path in patterns:
                        for attr in dom.xpath(path):
                            log.debug("{}: {} {}".format(path, url, attr))
                            url = unquote(urljoin(url, urldefrag(attr)[0]))
                            if url not in archive['seen']:
                                archive['seen'][url] = True
                                await url_queue.put(url)
                elif 'text/css' == content_type:
                    # TODO: nested @import and better path inference
                    for attr in getUrls(parseString(data)):
                        log.debug(attr)
                        url = unquote(urljoin(url, urldefrag(attr)[0]))
                        if url not in archive['seen']:
                            archive['seen'][url] = True
                            await url_queue.put(url)
        except Exception as exc:
            log.warn('Exception {}:'.format(exc), exc_info=True)
        finally:
            url_queue.task_done()

async def get_body(self, session):
    while not self.new_urls.empty():
        current_url, level = await self.new_urls.get()
        if current_url.human_repr() in self.visited_urls:
            continue
        self.visited_urls.append(current_url.human_repr())
        file_name, hash_name = self._make_filename(current_url)
        print('name: ', file_name)
        self.meta[file_name] = {}
        data = None
        content_type = None
        try:
            response = await session.get(current_url, headers={'Accept': 'text/html'}, timeout=10.0)
            content_type = response.content_type
            data = await response.read()
        except (aiohttp.ClientError, asyncio.TimeoutError) as client_error:
            self.logger.error(client_error)
        else:
            await response.release()
        if data is not None:
            self.meta[file_name]['hash'] = hash_name
            self.meta[file_name]['content_type'] = content_type
            if content_type == 'text/html':
                soup = await self.replace_links(data, level)
                data = str(soup).encode()
            with open(os.path.join(self.target_path, hash_name), 'wb') as index_fh:
                index_fh.write(data)
            if content_type == 'text/css':
                css = cssutils.parseString(data, validate=self.css_validate)
                for carved_url in cssutils.getUrls(css):
                    if carved_url.startswith('data'):
                        continue
                    carved_url = yarl.URL(carved_url)
                    if not carved_url.is_absolute():
                        carved_url = self.root.join(carved_url)
                    if carved_url.human_repr() not in self.visited_urls:
                        await self.new_urls.put((carved_url, level + 1))

def test_getUrls(self):
    "cssutils.getUrls()"
    cssutils.ser.prefs.keepAllProperties = True

    css = '''
    @import "im1";
    @import url(im2);
    @import url( im3 );
    @import url( "im4" );
    @import url( 'im5' );
    a {
        background-image: url(c) !important;
        background-\image: url(b);
        background: url(a) no-repeat !important;
    }'''
    s = cssutils.parseString(css)
    urls = set(cssutils.getUrls(s))
    self.assertEqual(urls, set(["im1", "im2", "im3", "im4", "im5",
                                "c", "b", "a"]))

    cssutils.ser.prefs.keepAllProperties = False

def css_download_url_refs(root_url, url_parts, sheet, dst_folder):
    """ download images etc referenced in the css file """
    tmp_url_parts = list(urlparse.urlparse(deepcopy(root_url)))  # url_parts
    url_from_tmp_url_parts = urlparse.urlunparse(tmp_url_parts)
    urls = cssutils.getUrls(sheet)
    for url in urls:
        if url.startswith("/"):
            file_name = url.split("/")[-1]
            file_name = sanitize_file_name(file_name)
            new_src_url = sanitize_url(list(urlparse.urlparse(url))[2])
            new_src = create_directories(dst_folder, new_src_url)
            full_path = os.path.join(dst_folder, new_src)
            outpath = os.path.join(full_path, file_name)
            path = root_url + url
            if file_name != "":
                logging.debug("downloading css reference " + file_name + "..." + path)
                if url.lower().startswith("http"):
                    download_file(url, outpath)
                else:
                    download_file(path, outpath)
        else:
            file_name = url.split("/")[-1]
            file_name = sanitize_file_name(file_name)
            new_src_url = sanitize_url(list(urlparse.urlparse(url))[2])
            new_src = create_directories(dst_folder, new_src_url)
            full_path = os.path.join(dst_folder, new_src)
            outpath = os.path.join(full_path, file_name)
            path = root_url.replace(file_name, "") + url
            if file_name != "":
                logging.debug("downloading css reference " + file_name + "...")
                if url.lower().startswith("http"):
                    download_file(url, outpath)
                else:
                    download_file(path, outpath)

def show_urls(s, data):
    stylesheet = cssutils.parseString(s)  # parseFile (f)
    # its a start :)
    #print [u for u in cssutils.getUrls (stylesheet)]
    for u in cssutils.getUrls(stylesheet):
        print u

def get_media_requests(self, item, info):
    sheet = CSSStyleSheet()
    sheet.cssText = item['content']
    urls = cssutils.getUrls(sheet)
    return [Request(u) for u in urls]

for i in range(len(a)):
    directory = a[i]['href']
    if ".css" not in directory:
        print("-------Skipped for ---------", directory)
        continue
    if "http" in directory or "https" in directory:
        print("------Skipped for ----- ", directory)
        continue
    print('\t[+]Getting CSS = ' + str(directory))
    if "/" not in directory:
        print("\tNo directory. Saving file", directory)
    elif not os.path.exists(os.path.dirname(directory)):
        print(" [DIR]Creating directory")
        os.makedirs(os.path.dirname(directory))
    testfile, headers = urlretrieve(baseurl + directory, directory, reporthook=report)
    urls = list(cssutils.getUrls(cssutils.parseFile(directory)))
    if len(urls) != 0:
        for link in urls:
            try:
                if "http" in directory or "https" in link or "data:image/" in link:
                    print("------Skipped for ----- ", link)
                    continue
                while "../" in link:
                    if "assets" in link:
                        link = link[3:]
                    else:
                        link = "assets/" + link[3:]
                print('\t\t[+]Getting CSS-Image = ' + str(link))
                if "/" not in link:
                    print("\t\tNo directory. Saving file", link)
                elif not os.path.exists(os.path.dirname(link)):

def get_body(self, root_url, urls, visited_urls):
    if not root_url.startswith("http"):
        root_url = 'http://' + root_url
    visited_urls.append(root_url)
    parsed_url = urlparse(root_url)
    if parsed_url.fragment:
        return
    domain = parsed_url.netloc
    if not domain.endswith('/'):
        domain += '/'
    file_name = self.make_new_link(root_url)
    file_path = ''
    patt = '/.*/.*\.'
    if re.match(patt, file_name):
        file_path, file_name = file_name.rsplit('/', 1)
        file_path += '/'
    print('path: ', file_path, 'name: ', file_name)
    if len(domain) < 4:
        sys.exit('invalid target {}'.format(root_url))
    page_path = '/opt/snare/pages/{}'.format(domain)
    if not os.path.exists(page_path):
        os.mkdir(page_path)
    if file_path and not os.path.exists(page_path + file_path):
        os.makedirs(page_path + file_path)
    data = None
    try:
        with aiohttp.Timeout(10.0):
            with aiohttp.ClientSession() as session:
                response = yield from session.get(root_url)
                data = yield from response.read()
    except Exception as e:
        print(e)
    else:
        response.release()
        session.close()
    if data is not None:
        if re.match(re.compile('.*\.(html|php)'), file_name):
            soup = self.replace_links(data, domain, urls)
            data = str(soup).encode()
        with open(page_path + file_path + file_name, 'wb') as index_fh:
            index_fh.write(data)
        if '.css' in file_name:
            css = cssutils.parseString(data)
            for carved_url in cssutils.getUrls(css):
                if carved_url.startswith('data'):
                    continue
                carved_url = os.path.normpath(os.path.join(domain, carved_url))
                if not carved_url.startswith('http'):
                    if carved_url.startswith('..') or carved_url.startswith('/'):
                        carved_url = 'http://' + domain + carved_url
                    else:
                        carved_url = 'http://' + carved_url
                if carved_url not in visited_urls:
                    urls.insert(0, carved_url)
    for url in urls:
        urls.remove(url)
        if url in visited_urls:
            continue
        yield from self.get_body(url, urls, visited_urls)

def reviews(self, app_id_list, page=0):
    #print("In Scraper - reviews def \n")
    reviews_adder = []
    for n in range(len(app_id_list)):
        #app_id = app_id_list[n]
        #print(app_id)
        data = {
            'reviewType': 0,
            'pageNum': page,
            'id': app_id_list[n],
            'reviewSortOrder': 4,
            'xhr': 1,
            'hl': self.language
        }
        self.params['authuser'] = '******'
        #print('before send request')
        #print(app_id_list[n])
        response = send_request('POST', s.REVIEW_URL, data, self.params)
        content = response.text
        content = content[content.find('[["ecr"'):].strip()
        data = json.loads(content)
        #print(data)
        html = data[0][2]
        soup = BeautifulSoup(html, 'lxml', from_encoding='utf8')
        #print(soup)
        reviews = []
        for element in soup.select('.single-review'):
            #print('Inside single review')
            review = {}
            #print("In Scraper - reviews def- rev_app_id:: \n")
            #print(app_id)
            review['rev_app_id'] = app_id_list[n]
            avatar_style = element.select_one('.author-image').get('style')
            #print(avatar_style)
            if avatar_style:
                sheet = cssutils.css.CSSStyleSheet()
                sheet.add('tmp { %s }' % avatar_style)
                review['author_image'] = list(cssutils.getUrls(sheet))[0]
            review_header = element.select_one('.review-header')
            review['review_id'] = review_header.get('data-reviewid', '')
            review['review_permalink'] = review_header.select_one('.reviews-permalink').get('href')
            review['author_name'] = review_header.select_one('.author-name').text
            review['review_date'] = review_header.select_one('.review-date').text
            curr_rating = review_header.select_one('.current-rating').get('style')
            review['current_rating'] = int(
                int(str(cssutils.parseStyle(curr_rating).width).replace('%', '')) / 20)
            body_elem = element.select_one('.review-body')
            review_title = body_elem.select_one('.review-title').extract()
            body_elem.select_one('.review-link').decompose()
            review['review_title'] = review_title.text
            review['review_body'] = body_elem.text
            reviews.append(review)
            reviews_adder.append(review)
        data = ''
    return reviews_adder

def web_crawler(url, depth=0, page_assets=False):
    if depth >= 0:
        opener = request.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla')]
        request.install_opener(opener)
        base_url = "{0.scheme}://{0.netloc}/".format(parse.urlsplit(url))
        if url not in links:
            links.append(url)
        raw = requests.get(url).text
        if page_assets:
            try:
                sheet = cssutils.parseString(requests.get(url).content)
                urls = cssutils.getUrls(sheet)
                for url in urls:
                    if url not in links:
                        links.append(url)
                        path = request.urlopen(url)
                        meta = path.info()
                        print(url, ' size: ', meta.get(name="Content-Length"))
            except:
                pass
        soup = bs(raw, 'html.parser')
        for script in soup.find_all("script"):
            if script.attrs.get("src"):
                script_url = parse.urljoin(url, script.attrs.get("src"))
                if script_url not in assets:
                    path = request.urlopen(script_url)
                    meta = path.info()
                    print(script_url, ' size: ', meta.get(name="Content-Length"))
                    assets.append(script_url)
                    if page_assets and script_url not in links:
                        links.append(script_url)
                    web_crawler(script_url, depth - 1, page_assets)
        for css in soup.find_all("link", {"rel": "stylesheet"}):
            if css.attrs.get("href"):
                css_url = parse.urljoin(url, css.attrs.get("href"))
                if css_url not in assets:
                    try:
                        path = request.urlopen(css_url)
                        meta = path.info()
                        print(css_url, ' ', 'size: ', meta.get(name="Content-Length"))
                        assets.append(css_url)
                        if page_assets and css_url not in links:
                            links.append(css_url)
                        web_crawler(css_url, depth - 1, page_assets)
                    except:
                        pass
        for img in soup.find_all("img"):
            if img.get("src"):
                img_url = parse.urljoin(url, img.get("src"))
                try:
                    path = request.urlopen(img_url)
                    meta = path.info()
                    if img_url not in assets:
                        print(img_url, ' ', 'size: ', meta.get(name="Content-Length"))
                        assets.append(img_url)
                except:
                    pass
        for a in soup.find_all('a'):
            href = str(a.get('href'))
            if 'http://' not in href and 'https://' not in href and base_url not in href:
                href = base_url + href[1:]
            if href not in links:
                path = request.urlopen(href)
                meta = path.info()
                print(href, ' ', 'size: ', meta.get(name="Content-Length"))
                links.append(href)
                web_crawler(href, depth - 1, page_assets)

def _manifest_add_missing(self, invalid):
    import cssutils
    manifest = self.oeb.manifest
    known = set(manifest.hrefs)
    unchecked = set(manifest.values())
    cdoc = OEB_DOCS | OEB_STYLES
    invalid = set()
    while unchecked:
        new = set()
        for item in unchecked:
            data = None
            if (item.media_type in cdoc or
                    item.media_type[-4:] in ('/xml', '+xml')):
                try:
                    data = item.data
                except:
                    self.oeb.log.exception(u'Failed to read from manifest '
                                           u'entry with id: %s, ignoring' % item.id)
                    invalid.add(item)
                    continue
            if data is None:
                continue

            if (item.media_type in OEB_DOCS or
                    item.media_type[-4:] in ('/xml', '+xml')):
                hrefs = [r[2] for r in iterlinks(data)]
                for href in hrefs:
                    if isinstance(href, bytes):
                        href = href.decode('utf-8')
                    href, _ = urldefrag(href)
                    if not href:
                        continue
                    try:
                        href = item.abshref(urlnormalize(href))
                        scheme = urlparse(href).scheme
                    except:
                        self.oeb.log.exception('Skipping invalid href: %r' % href)
                        continue
                    if not scheme and href not in known:
                        new.add(href)
            elif item.media_type in OEB_STYLES:
                try:
                    urls = list(cssutils.getUrls(data))
                except:
                    urls = []
                for url in urls:
                    href, _ = urldefrag(url)
                    href = item.abshref(urlnormalize(href))
                    scheme = urlparse(href).scheme
                    if not scheme and href not in known:
                        new.add(href)
        unchecked.clear()
        warned = set([])
        for href in new:
            known.add(href)
            is_invalid = False
            for item in invalid:
                if href == item.abshref(urlnormalize(href)):
                    is_invalid = True
                    break
            if is_invalid:
                continue
            if not self.oeb.container.exists(href):
                if href not in warned:
                    self.logger.warn('Referenced file %r not found' % href)
                    warned.add(href)
                continue
            if href not in warned:
                self.logger.warn('Referenced file %r not in manifest' % href)
                warned.add(href)
            id, _ = manifest.generate(id='added')
            guessed = guess_type(href)[0]
            media_type = guessed or BINARY_MIME
            added = manifest.add(id, href, media_type)
            unchecked.add(added)
    for item in invalid:
        self.oeb.manifest.remove(item)

def get_gumroad_icon_url(data):
    soup = BeautifulSoup(data, 'html.parser')
    sheet = cssutils.css.CSSStyleSheet()
    sheet.add("dummy_selector { %s }" % soup.select_one(
        '.profile-picture-medium.js-profile-picture').get('style'))
    return list(cssutils.getUrls(sheet))[0]

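# Many of the snippets above follow the same basic pattern: parse the CSS,
# walk cssutils.getUrls(), skip data: URIs, and resolve each relative
# reference against the stylesheet's own URL. A minimal, self-contained sketch
# of that pattern; the helper name, base URL, and CSS text are made-up
# examples, not taken from any of the projects above.
from urllib.parse import urljoin
import cssutils

def absolute_css_urls(css_text, base_url):
    sheet = cssutils.parseString(css_text)
    return [urljoin(base_url, u)
            for u in cssutils.getUrls(sheet)
            if not u.startswith('data:')]

# absolute_css_urls('a { background: url(../img/bg.png); }',
#                   'https://example.com/static/css/site.css')
# -> ['https://example.com/static/img/bg.png']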