Example #1
    def run(self):
        # Step 0: ensure that the document_root and base_path variables are
        # set. If the file that's being processed was inside a source that has
        # either one or both not set, then this processor can't run.
        if self.document_root is None or self.base_path is None:
            raise DocumentRootAndBasePathRequiredException

        # We don't rename the file, so we can use the default output file.

        parser = CSSParser(log=None, loglevel=logging.CRITICAL)
        sheet = parser.parseFile(self.input_file)

        # Step 1: ensure the file has URLs. If it doesn't, we can stop the
        # processing.
        url_count = 0
        for url in getUrls(sheet):
            url_count += 1
            break
        if url_count == 0:
            return self.input_file

        # Step 2: resolve the relative URLs to absolute paths.
        replaceUrls(sheet, self.resolveToAbsolutePath)

        # Step 3: verify that each of these files has been synced.
        synced_files_db = urljoin(sys.path[0] + os.sep, SYNCED_FILES_DB)
        self.dbcon = sqlite3.connect(synced_files_db)
        self.dbcon.text_factory = unicode  # This is the default, but we set it explicitly, just to be sure.
        self.dbcur = self.dbcon.cursor()
        all_synced = True
        for urlstring in getUrls(sheet):
            # Skip absolute URLs.
            if urlstring.startswith("http://") or urlstring.startswith("https://"):
                continue

            # Skip broken references in the CSS file. This would otherwise
            # prevent this CSS file from ever passing through this processor.
            if not os.path.exists(urlstring):
                continue

            # Get the CDN URL for the given absolute path.
            self.dbcur.execute("SELECT url FROM synced_files WHERE input_file=?", (urlstring,))
            result = self.dbcur.fetchone()

            if result is None:
                raise RequestToRequeueException(
                    "The file '%s' has not yet been synced to the server '%s'" % (urlstring, self.process_for_server)
                )
            else:
                cdn_url = result[0]

        # Step 4: resolve the absolute paths to CDN URLs.
        replaceUrls(sheet, self.resolveToCDNURL)

        # Step 5: write the updated CSS to the output file.
        f = open(self.output_file, "w")
        f.write(sheet.cssText)
        f.close()

        return self.output_file
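A minimal, self-contained sketch of the getUrls()/replaceUrls() pattern used by the processor above. The CDN host and the lambda rewriter are invented for illustration; the real processor resolves each path through its synced-files database via resolveToAbsolutePath and resolveToCDNURL.

import cssutils

sheet = cssutils.parseString('body { background: url(images/bg.png) }')

# Step 1 equivalent: bail out early if the sheet references no URLs at all.
if any(True for _ in cssutils.getUrls(sheet)):
    # Steps 2/4 equivalent: rewrite every url(...) token through a callback.
    cssutils.replaceUrls(sheet, lambda u: 'https://cdn.example.com/' + u)

# The serialized sheet now points at https://cdn.example.com/images/bg.png.
print(sheet.cssText.decode('utf-8'))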
Example #2
def clone_all_css_resources(css, current_directory, root_url, root_directory):
    #Clone all resources associated with each url
    css_sheet = cssutils.parseString(css)
    resource_urls = cssutils.getUrls(css_sheet)

    for url in resource_urls:
        if url not in resource_url_list:
            resource_url_list.append(url)

            #Create any required new directories for this url
            url = urlparse.urljoin(root_url, url)
            css_directory = check_create_directory(url, current_directory, root_directory)

            file_name = urlparse.urlparse(url).path.split('/')[-1]

            #Save this file to the directory
            try:
                output_file_directory = os.path.join(css_directory, file_name)
                urllib.urlretrieve(url, output_file_directory)
            except:
                print 'io error'
                print url
                print output_file_directory
                #raise Exception("UrlLib Error: writing file %s" % os.path.join(css_directory, file_name) )
                sys.exit(0)

            print("%s cloned..." % url)
Example #3
 def get_links_from_css(self, style_text, item):
     '''
         Extract URLs from the given CSS style text and return requests to
         download these images. The URLs in the item content are also
         rewritten to absolute URIs so that they can later be replaced by
         local URLs.
     '''
     response = item['response']
     sheet = CSSStyleSheet()
     sheet.cssText = style_text
     urls = cssutils.getUrls(sheet)
     requests = []
     item_content = item['content']
     for url in urls:
         request_url = response.url.replace('http://', '')
         if url[0] == '/':
             request_url = request_url.split('/')[0] + url
         else:
             request_url = request_url.split('/')
             request_url[-1] = url
             request_url = '/'.join(request_url)
         request_url = 'http://%s' % request_url
         item_content = item_content.replace(url, request_url)
         requests.append(Request(request_url))
     item['content'] = item_content
     return requests
Example #4
    def process(self):
        if 'fonts' in self.link:  # Omit google fonts
            self.tag.decompose()
            return

        # Parse urls in css (using parseString because it is much faster than parseUrl)
        style_sheet = downloader.read(self.link).decode('utf-8-sig',
                                                        errors='ignore')
        sheet = cssutils.parseString(style_sheet)
        for css_url in cssutils.getUrls(sheet):
            if not css_url.startswith('data:image') and not css_url.startswith(
                    'data:application'):
                try:
                    style_sheet = style_sheet.replace(
                        css_url,
                        os.path.basename(
                            self.write_url(css_url,
                                           url=self.link,
                                           default_ext='.png')))
                except BROKEN_EXCEPTIONS as e:
                    LOGGER.warn(
                        'Unable to download stylesheet url at {} ({})'.format(
                            self.url, str(e)))

        self.tag[self.attribute] = self.format_url(
            self.write_contents(self.get_filename(self.link), style_sheet))
        return self.tag[self.attribute]
Example #5
 def iterlinks(self, name, get_line_numbers=True):
     ''' Iterate over all links in name. If get_line_numbers is True this
     yields results of the form (link, line_number, offset), where
     line_number is the line number at which the link occurs and offset is
     the number of characters from the start of the line. Note that offset
     could actually encompass several lines if not zero. '''
     media_type = self.mime_map.get(name, guess_type(name))
     if name == self.opf_name:
         for elem in self.opf_xpath('//*[@href]'):
             yield (elem.get('href'), elem.sourceline, 0) if get_line_numbers else elem.get('href')
     elif media_type.lower() in OEB_DOCS:
         for el, attr, link, pos in iterlinks(self.parsed(name)):
             yield (link, el.sourceline, pos) if get_line_numbers else link
     elif media_type.lower() in OEB_STYLES:
         if get_line_numbers:
             with self.open(name, 'rb') as f:
                 raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
                 position = PositionFinder(raw)
                 is_in_comment = CommentFinder(raw)
                 for link, offset in itercsslinks(raw):
                     if not is_in_comment(offset):
                         lnum, col = position(offset)
                         yield link, lnum, col
         else:
             for link in getUrls(self.parsed(name)):
                 yield link
     elif media_type.lower() == guess_type('toc.ncx'):
         for elem in self.parsed(name).xpath('//*[@src]'):
             yield (elem.get('src'), elem.sourceline, 0) if get_line_numbers else elem.get('src')
Example #6
def geturls(stylesheet):
    """
    Return a list of all URLs appearing in the :class:`CSSStyleSheet`
    :obj:`stylesheet`.
    """
    return [url.URL(u) for u in cssutils.getUrls(stylesheet)]  # This requires cssutils 0.9.5b1
Example #7
 def parse(self, response):
     items = ScraphubItem()
     tempfolder = tempfile.TemporaryDirectory(dir=self.service_config['temp_dir'])
     for deal_ in response.css('div.panel-body'):
         metadata = {}
         metadata['source'] = self.start_urls
         url = None
         image_name = None
         ccssrc_ = deal_.css('div.deal-image').get()
         if ccssrc_ is not None:
             soup = BeautifulSoup(ccssrc_)
             div_style = soup.find('div')['style']
             sheet = cssutils.css.CSSStyleSheet()
             sheet.add("dummy_selector { %s }" % div_style)
             url = list(cssutils.getUrls(sheet))[0]
             image_name = os.path.basename(urlparse(url).path)
             opener = urllib.request.URLopener()
             opener.addheader('User-Agent', 'Mozilla/5.0')
             filename, headers = opener.retrieve(url, '%s/'%(tempfolder.name) + image_name) 
             BucketUploader.upload_blob('%s/'%(tempfolder.name) + image_name, self.service_config['bucket_PATH'] + str(image_name))
             items['description'] = deal_.css('a::text').get()
             items['discount'] = deal_.css('div.deal-discount::text').get()
             items['imgurl'] = self.service_config['bucket_baseURL'] + str(image_name)
             items['base_amount'] = deal_.css('small::text').get()  # recorded as a string; this needs to be converted to an integer
             items['source_href'] = GlobalVariable.urlRegx(deal_.css('a').get())  # this needs to be handled properly
             items['fetched_date'] = datetime.datetime.today()
             # print(deal_.css('div.col-sm-3').get())
             # metadata['expire'] = deal_.css('span.expirydate::text').get()
             items['meta'] =  json.dumps(metadata)
             items['is_automated'] = True
             items['postid'] = str(uuid.uuid1()) 
             items['expire_date'] = datetime.datetime.today()
             # items['current_amount'] = deal_.css('span::text').get()
             yield items
     tempfolder.cleanup()
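The dummy-selector trick used above (wrapping an inline style attribute in a throwaway rule so cssutils can parse it as a stylesheet) shown in isolation; the selector name and style text below are placeholders.

import cssutils

div_style = "background-image: url('/images/deal-123.jpg'); height: 200px"

sheet = cssutils.css.CSSStyleSheet()
sheet.add("dummy_selector { %s }" % div_style)  # wrap the attribute in a fake rule

urls = list(cssutils.getUrls(sheet))
print(urls[0] if urls else None)  # /images/deal-123.jpg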
Example #8
 def process(self, files):
     for f in files:
         if not f['resource_path'].endswith('.css') or f['type'] != 'file':
             yield f
             continue
         self._counter += 1
         fs_rpath = f['filesystem_path']
         sheet = cssutils.parseFile(fs_rpath)
         sheet.setSerializer(self.serializer)
         for url in cssutils.getUrls(sheet):
             u = urlparse(url)
             if u.scheme or u.netloc or not u.path.startswith('./'):
                 logging.warning('non-relative URL used in CSS: %s' % url)
         if self.resolve_imports:
             sheet = cssutils.resolveImports(sheet)
         target = os.path.join(
             self._tmpdir,
             str(self._counter) + '-' + os.path.basename(fs_rpath))
         out_f = open(target, 'wb')
         try:
             out_f.write(sheet.cssText)
         finally:
             out_f.close()
         f['filesystem_path'] = target
         yield f
Example #9
    def test_getUrls(self):
        "cssutils.getUrls()"
        cssutils.ser.prefs.keepAllProperties = True

        css = '''
        @import "im1";
        @import url(im2);
        @import url( im3 );
        @import url( "im4" );
        @import url( 'im5' );
        a {
            background-image: url(a) !important;
            background-\image: url(b);
            background: url(c) no-repeat !important;
            /* issue #46 */
            src: local("xx"),
                 url("f.woff") format("woff"),
                 url("f.otf") format("opentype"),
                 url("f.svg#f") format("svg");
            }'''
        urls = set(cssutils.getUrls(cssutils.parseString(css)))
        self.assertEqual(
            urls,
            set([
                "im1", "im2", "im3", "im4", "im5", "a", "b", "c", u'f.woff',
                u'f.svg#f', u'f.otf'
            ]))
        cssutils.ser.prefs.keepAllProperties = False
Example #10
 def get_links_from_css(self, style_text, item):
     '''
         Extract URLs from the given CSS style text and return requests to
         download these images. The URLs in the item content are also
         rewritten to absolute URIs so that they can later be replaced by
         local URLs.
     '''
     response = item['response']
     sheet = CSSStyleSheet()
     sheet.cssText = style_text
     urls = cssutils.getUrls(sheet)
     requests = []
     item_content = item['content']
     for url in urls:
         request_url = response.url.replace('http://', '')
         if url[0] == '/':
             request_url = request_url.split('/')[0] + url
         else:
             request_url = request_url.split('/')
             request_url[-1] = url
             request_url = '/'.join(request_url)
         request_url = 'http://%s' % request_url
         item_content = item_content.replace(url, request_url)
         requests.append(Request(request_url))
     item['content'] = item_content
     return requests
Example #11
    def test_getUrls(self):
        "cssutils.getUrls()"
        cssutils.ser.prefs.keepAllProperties = True

        css='''
        @import "im1";
        @import url(im2);
        @import url( im3 );
        @import url( "im4" );
        @import url( 'im5' );
        a {
            background-image: url(a) !important;
            background-\image: url(b);
            background: url(c) no-repeat !important;
            /* issue #46 */
            src: local("xx"),
                 url("f.woff") format("woff"),
                 url("f.otf") format("opentype"),
                 url("f.svg#f") format("svg");
            }'''
        urls = set(cssutils.getUrls(cssutils.parseString(css)))
        self.assertEqual(urls, set(["im1", "im2", "im3", "im4", "im5",
                                    "a", "b", "c",
                                    u'f.woff', u'f.svg#f', u'f.otf']))
        cssutils.ser.prefs.keepAllProperties = False
Example #12
def _get_css_imports_cssutils(data, inline=False):
    """Return all assets that are referenced in the given CSS document.

    The returned URLs are relative to the stylesheet's URL.

    Args:
        data: The content of the stylesheet to scan as string.
        inline: True if the argument is an inline HTML style attribute.
    """
    try:
        import cssutils
    except (ImportError, re.error):
        # Catching re.error because cssutils in earlier releases (<= 1.0) is
        # broken on Python 3.5
        # See https://bitbucket.org/cthedot/cssutils/issues/52
        return None

    # We don't care about invalid CSS data, this will only litter the log
    # output with CSS errors
    parser = cssutils.CSSParser(loglevel=100,
                                fetcher=lambda url: (None, ""), validate=False)
    if not inline:
        sheet = parser.parseString(data)
        return list(cssutils.getUrls(sheet))
    else:
        urls = []
        declaration = parser.parseStyle(data)
        # prop = background, color, margin, ...
        for prop in declaration:
            # value = red, 10px, url(foobar), ...
            for value in prop.propertyValue:
                if isinstance(value, cssutils.css.URIValue):
                    if value.uri:
                        urls.append(value.uri)
        return urls
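A short usage sketch for the helper above, assuming it is defined in the current module alongside `import re`; the stylesheet strings and expected outputs are invented for illustration. Note that it returns None when cssutils cannot be imported.

sheet_css = '@import "reset.css"; body { background: url(img/bg.png) }'
inline_css = "background-image: url('icons/star.svg')"

print(_get_css_imports_cssutils(sheet_css))
# ['reset.css', 'img/bg.png']
print(_get_css_imports_cssutils(inline_css, inline=True))
# ['icons/star.svg']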
Example #13
 def iterlinks(self, name, get_line_numbers=True):
     """ Iterate over all links in name. If get_line_numbers is True the
     yields results of the form (link, line_number, offset). Where
     line_number is the line_number at which the link occurs and offset is
     the number of characters from the start of the line. Note that offset
     could actually encompass several lines if not zero. """
     media_type = self.mime_map.get(name, guess_type(name))
     if name == self.opf_name:
         for elem in self.opf_xpath("//*[@href]"):
             yield (elem.get("href"), elem.sourceline, 0) if get_line_numbers else elem.get("href")
     elif media_type.lower() in OEB_DOCS:
         for el, attr, link, pos in iterlinks(self.parsed(name)):
             yield (link, el.sourceline, pos) if get_line_numbers else link
     elif media_type.lower() in OEB_STYLES:
         if get_line_numbers:
             with self.open(name) as f:
                 raw = self.decode(f.read())
                 for link, offset in itercsslinks(raw):
                     yield link, 0, offset
         else:
             for link in getUrls(self.parsed(name)):
                 yield link
     elif media_type.lower() == guess_type("toc.ncx"):
         for elem in self.parsed(name).xpath("//*[@src]"):
             yield (elem.get("src"), elem.sourceline, 0) if get_line_numbers else elem.get("src")
Example #14
def _get_css_imports_cssutils(data, inline=False):
    """Return all assets that are referenced in the given CSS document.

    The returned URLs are relative to the stylesheet's URL.

    Args:
        data: The content of the stylesheet to scan as string.
        inline: True if the argument is an inline HTML style attribute.
    """
    # We don't care about invalid CSS data, this will only litter the log
    # output with CSS errors
    parser = cssutils.CSSParser(loglevel=100,
                                fetcher=lambda url: (None, ""),
                                validate=False)
    if not inline:
        sheet = parser.parseString(data)
        return list(cssutils.getUrls(sheet))
    else:
        urls = []
        declaration = parser.parseStyle(data)
        # prop = background, color, margin, ...
        for prop in declaration:
            # value = red, 10px, url(foobar), ...
            for value in prop.propertyValue:
                if isinstance(value, cssutils.css.URIValue):
                    if value.uri:
                        urls.append(value.uri)
        return urls
Example #15
 def process(self, files):
     for f in files:
         if not f['resource_path'].endswith('.css') or f['type'] != 'file':
             yield f
             continue
         self._counter += 1
         fs_rpath = f['filesystem_path']
         sheet = cssutils.parseFile(fs_rpath)
         sheet.setSerializer(self.serializer)
         for url in cssutils.getUrls(sheet):
             u = urlparse(url)
             if u.scheme or u.netloc or not u.path.startswith('./'):
                 logging.warning('non-relative URL used in CSS: %s' % url)
         if self.resolve_imports:
             sheet = cssutils.resolveImports(sheet)
         target = os.path.join(
                 self._tmpdir,
                 str(self._counter) + '-' + os.path.basename(fs_rpath))
         out_f = open(target, 'wb')
         try:
             out_f.write(sheet.cssText)
         finally:
             out_f.close()
         f['filesystem_path'] = target
         yield f
Example #16
 def iterlinks(self, name, get_line_numbers=True):
     ''' Iterate over all links in name. If get_line_numbers is True this
     yields results of the form (link, line_number, offset), where
     line_number is the line number at which the link occurs and offset is
     the number of characters from the start of the line. Note that offset
     could actually encompass several lines if not zero. '''
     media_type = self.mime_map.get(name, guess_type(name))
     if name == self.opf_name:
         for elem in self.opf_xpath('//*[@href]'):
             yield (elem.get('href'), elem.sourceline,
                    0) if get_line_numbers else elem.get('href')
     elif media_type.lower() in OEB_DOCS:
         for el, attr, link, pos in iterlinks(self.parsed(name)):
             yield (link, el.sourceline, pos) if get_line_numbers else link
     elif media_type.lower() in OEB_STYLES:
         if get_line_numbers:
             with self.open(name, 'rb') as f:
                 raw = self.decode(f.read()).replace('\r\n', '\n').replace(
                     '\r', '\n')
                 position = PositionFinder(raw)
                 is_in_comment = CommentFinder(raw)
                 for link, offset in itercsslinks(raw):
                     if not is_in_comment(offset):
                         lnum, col = position(offset)
                         yield link, lnum, col
         else:
             for link in getUrls(self.parsed(name)):
                 yield link
     elif media_type.lower() == guess_type('toc.ncx'):
         for elem in self.parsed(name).xpath('//*[@src]'):
             yield (elem.get('src'), elem.sourceline,
                    0) if get_line_numbers else elem.get('src')
Example #17
def _get_css_imports_cssutils(data, inline=False):
    """Return all assets that are referenced in the given CSS document.

    The returned URLs are relative to the stylesheet's URL.

    Args:
        data: The content of the stylesheet to scan as string.
        inline: True if the argument is an inline HTML style attribute.
    """
    # We don't care about invalid CSS data, this will only litter the log
    # output with CSS errors
    parser = cssutils.CSSParser(loglevel=100,
                                fetcher=lambda url: (None, ""), validate=False)
    if not inline:
        sheet = parser.parseString(data)
        return list(cssutils.getUrls(sheet))
    else:
        urls = []
        declaration = parser.parseStyle(data)
        # prop = background, color, margin, ...
        for prop in declaration:
            # value = red, 10px, url(foobar), ...
            for value in prop.propertyValue:
                if isinstance(value, cssutils.css.URIValue):
                    if value.uri:
                        urls.append(value.uri)
        return urls
Example #18
def scrape_style(url, zipper):
    """ Scrape any instances of url(...)
        Args:
            url (str): url to css file
            zipper (html_writer): zip to write to
        Returns str of css style rules
    """
    sheet = cssutils.parseUrl(url)
    rules = sheet.cssText.decode('utf-8')

    # Parse urls in css
    for url in cssutils.getUrls(sheet):
        try:
            # Download any urls in css to the shared asset directory (if not already there)
            filename = url.split('?')[0].split('/')[-1]
            filepath = os.path.sep.join([SHARED_ASSET_DIRECTORY, filename])
            if not os.path.isfile(filepath):
                with open(filepath, 'wb') as fobj:
                    fobj.write(read(url))

            # Replace text with new url
            new_url = zipper.write_file(filepath, filename, directory="assets")
            rules = rules.replace(url, "../" + new_url)

        except requests.exceptions.HTTPError:
            LOGGER.warning("Could not download css url {}".format(url))

    return rules
Example #19
def acquire_css_files(html, soup, webpage_cursor, local_file_path, root_directory):
    #Find all css file locations
    for css_link in soup.find_all('link'):

        #Convert relative link to css file to an absolute link
        href = urlparse.urljoin(webpage_cursor, css_link['href'])

        if href not in css_url_list:
            css_url_list.append(href)


            current_directory = check_create_directory(href, local_file_path, root_directory)
            file_name = urlparse.urlparse(href).path.split('/')[-1]

            #Save this file to the directory
            request = urllib2.Request(href)
            try:
                responce = urllib2.urlopen(request)
                css = responce.read()
            except urllib2.URLError, e:
                raise Exception("%s returned an error: %s" % (href, e) )
                sys.exit(0)

            modified_css_sheet = cssutils.parseString(css)
            resource_urls = set(cssutils.getUrls(modified_css_sheet))
            modified_css_text = css

            print 'href_test: ', href
            file_depth = href.count('/') - 3
            depth_relative_link_slashes = '../' * file_depth

            for url in resource_urls:
                if url.startswith('/'):
                    modified_url = depth_relative_link_slashes + url[1:]
                    modified_css_text = modified_css_text.replace(url, modified_url)
                    print url
                    print 'modified_url: ', modified_url

            #Iterate over all internal resources on each css file
            try:
                if file_name == '':
                    file_loc = os.path.join(current_directory, 'application.css')
                elif file_name.endswith('.css'):
                    file_loc = os.path.join(current_directory, file_name)
                else:
                    file_loc = os.path.join(current_directory, (file_name + '.css') )
                css_file = open(file_loc, 'w')
                css_file.write(modified_css_text)
            except IOError as e:
                print 'IO Write Error: %s'%e
                sys.exit(0)
            finally:
                css_file.close()

            print("%s cloned..." % href)

            #Clone all associated resources with this css file
            clone_all_css_resources(css, current_directory, webpage_cursor, root_directory)
Example #20
def get_links_from_css(css_file, base_url):
    """Get all links from a CSS file."""
    result = []
    text = css_file.read()
    parsed = cssutils.parseString(text)
    all_urls = cssutils.getUrls(parsed)
    for url in all_urls:
        result.append(urljoin(base_url, url))
    return result
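A hedged usage sketch for the helper above, assuming cssutils and urllib.parse.urljoin are imported where it is defined; it takes a readable file object, so io.StringIO stands in for a real CSS file and the base URL is invented.

import io

css_file = io.StringIO('h1 { background: url(/img/logo.png) } p { background-image: url(img/dot.gif) }')
links = get_links_from_css(css_file, 'https://example.com/static/styles/main.css')
print(links)
# ['https://example.com/img/logo.png', 'https://example.com/static/styles/img/dot.gif']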
Example #21
    def save_css_assets(self, path):
        project_path = self.get_project_path()
        file_name = path.replace(project_path, '')
        try:
            css = cssutils.parseFile(path)
            urls = cssutils.getUrls(css)
        except:
            self.error_files.append(file_name)
            return
        file_path = file_name.rsplit('/', 1)[0] 
        for url in urls:
            if 'http' not in url and 'https' not in url:
                url = url.rsplit('/', 1)
                if len(url) == 1:
                    asset = '/' + url[0]
                    path = ''
                elif len(url) > 1:
                    asset = '/' + url[1]
                    path = '/' + url[0]
                else:
                    continue
                if "../" in path:
                    path_a = path.split("../")
                    if path_a[-1] != '':
                        sub_path = file_path.split('/')
                        for i in range(len(path_a) - 1):
                            sub_path = sub_path[:-1]
                            path = '/' + path_a[-1]
                        sub_path = '/'.join(sub_path)
                else:
                    sub_path = file_path

                if sub_path.startswith('/'):
                    sub_path = sub_path[1:]

                l = self.get_requestable_link(sub_path + path + asset)

                if l in self.visited_assets:
                    continue

                r = self.do_request(l, stream=True, type=self.asset_request)

                if r.status_code == 200:
                    file = asset.split('?')[0]
                    full_path = self.get_dir(sub_path + path,True)

                    if file.endswith('.css'):
                        with open(full_path + file, "wb") as f:
                            f.write(r.text.encode('utf-8'))
                            f.close()
                        self.save_css_assets(full_path + file)
                    else:
                        with open(full_path + file, "wb") as f:
                            shutil.copyfileobj(r.raw, f)
Example #22
    def reviews(self, app_id, page=1):
        """Sends a POST request and retrieves a list of reviews for
        the specified app.

        :param app_id: the app to retrieve details from, e.g. 'com.nintendo.zaaa'
        :param page: the page number to retrieve; max is 10
        :return: a list of reviews
        """
        data = {
            'reviewType': 0,
            'pageNum': page,
            'id': app_id,
            'reviewSortOrder': 4,
            'xhr': 1,
            'hl': self.language
        }
        self.params['authuser'] = '******'

        response = send_request('POST', s.REVIEW_URL, data, self.params)
        content = response.text
        content = content[content.find('[["ecr"'):].strip()
        data = json.loads(content)
        html = data[0][2]
        soup = BeautifulSoup(html, 'lxml', from_encoding='utf8')

        reviews = []
        for element in soup.select('.single-review'):
            review = {}

            avatar_style = element.select_one('.author-image').get('style')
            if avatar_style:
                sheet = cssutils.css.CSSStyleSheet()
                sheet.add('tmp { %s }' % avatar_style)
                review['author_image'] = list(cssutils.getUrls(sheet))[0]

            review_header = element.select_one('.review-header')
            review['review_id'] = review_header.get('data-reviewid', '')
            review['review_permalink'] = review_header.select_one('.reviews-permalink').get('href')

            review['author_name'] = review_header.select_one('.author-name').text
            review['review_date'] = review_header.select_one('.review-date').text

            curr_rating = review_header.select_one('.current-rating').get('style')
            review['current_rating'] = int(int(str(cssutils.parseStyle(curr_rating).width).replace('%', '')) / 20)

            body_elem = element.select_one('.review-body')
            review_title = body_elem.select_one('.review-title').extract()
            body_elem.select_one('.review-link').decompose()
            review['review_title'] = review_title.text
            review['review_body'] = body_elem.text

            reviews.append(review)

        return reviews
Example #23
    def get_body(self):
        while not self.new_urls.empty():
            current_url = yield from self.new_urls.get()
            if current_url in self.visited_urls:
                continue
            self.visited_urls.append(current_url)
            if current_url.name:
                file_name = current_url.name
            elif current_url.raw_path != '/':
                file_name = current_url.path.rsplit('/')[1]
            else:
                file_name = 'index.html'
            file_path = os.path.dirname(current_url.path)
            if file_path == '/':
                file_path = self.target_path
            else:
                file_path = os.path.join(self.target_path, file_path[1:])

            print('path: ', file_path, 'name: ', file_name)

            if file_path and not os.path.exists(file_path):
                os.makedirs(file_path)

            data = None
            try:
                with aiohttp.Timeout(10.0):
                    with aiohttp.ClientSession() as session:
                        response = yield from session.get(current_url)
                        data = yield from response.read()
            except aiohttp.ClientError as client_error:
                print(client_error)
            else:
                response.release()
                session.close()
            if data is not None:
                if re.match(re.compile('.*\.(html|php)'), file_name):
                    soup = yield from self.replace_links(data)
                    data = str(soup).encode()
                with open(os.path.join(file_path, file_name),
                          'wb') as index_fh:
                    index_fh.write(data)
                if '.css' in file_name:
                    css = cssutils.parseString(data)
                    for carved_url in cssutils.getUrls(css):
                        if carved_url.startswith('data'):
                            continue
                        carved_url = yarl.URL(carved_url)
                        if not carved_url.is_absolute():
                            carved_url = self.root.join(carved_url)
                        if carved_url not in self.visited_urls:
                            yield from self.new_urls.put(carved_url)
Example #24
 def __call__(self, oeb, context):
     import cssutils
     oeb.logger.info('Trimming unused files from manifest...')
     self.opts = context
     used = set()
     for term in oeb.metadata:
         for item in oeb.metadata[term]:
             if item.value in oeb.manifest.hrefs:
                 used.add(oeb.manifest.hrefs[item.value])
             elif item.value in oeb.manifest.ids:
                 used.add(oeb.manifest.ids[item.value])
     for ref in oeb.guide.values():
         path, _ = urldefrag(ref.href)
         if path in oeb.manifest.hrefs:
             used.add(oeb.manifest.hrefs[path])
     # TOC items are required to be in the spine
     for item in oeb.spine:
         used.add(item)
     unchecked = used
     while unchecked:
         new = set()
         for item in unchecked:
             if (item.media_type in OEB_DOCS or
                 item.media_type[-4:] in ('/xml', '+xml')) and \
                item.data is not None:
                 hrefs = [r[2] for r in iterlinks(item.data)]
                 for href in hrefs:
                     if isinstance(href, bytes):
                         href = href.decode('utf-8')
                     try:
                         href = item.abshref(urlnormalize(href))
                     except:
                         continue
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
                         if found not in used:
                             new.add(found)
             elif item.media_type == CSS_MIME:
                 for href in cssutils.getUrls(item.data):
                     href = item.abshref(urlnormalize(href))
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
                         if found not in used:
                             new.add(found)
         used.update(new)
         unchecked = new
     for item in oeb.manifest.values():
         if item not in used:
             oeb.logger.info('Trimming %r from manifest' % item.href)
             oeb.manifest.remove(item)
Example #25
 def __call__(self, oeb, context):
     import cssutils
     oeb.logger.info('Trimming unused files from manifest...')
     self.opts = context
     used = set()
     for term in oeb.metadata:
         for item in oeb.metadata[term]:
             if item.value in oeb.manifest.hrefs:
                 used.add(oeb.manifest.hrefs[item.value])
             elif item.value in oeb.manifest.ids:
                 used.add(oeb.manifest.ids[item.value])
     for ref in oeb.guide.values():
         path, _ = urldefrag(ref.href)
         if path in oeb.manifest.hrefs:
             used.add(oeb.manifest.hrefs[path])
     # TOC items are required to be in the spine
     for item in oeb.spine:
         used.add(item)
     unchecked = used
     while unchecked:
         new = set()
         for item in unchecked:
             if (item.media_type in OEB_DOCS or
                 item.media_type[-4:] in ('/xml', '+xml')) and \
                item.data is not None:
                 hrefs = [r[2] for r in iterlinks(item.data)]
                 for href in hrefs:
                     if isinstance(href, bytes):
                         href = href.decode('utf-8')
                     try:
                         href = item.abshref(urlnormalize(href))
                     except:
                         continue
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
                         if found not in used:
                             new.add(found)
             elif item.media_type == CSS_MIME:
                 for href in cssutils.getUrls(item.data):
                     href = item.abshref(urlnormalize(href))
                     if href in oeb.manifest.hrefs:
                         found = oeb.manifest.hrefs[href]
                         if found not in used:
                             new.add(found)
         used.update(new)
         unchecked = new
     for item in oeb.manifest.values():
         if item not in used:
             oeb.logger.info('Trimming %r from manifest' % item.href)
             oeb.manifest.remove(item)
Example #26
    def process_resource(self, content, url):
        try:
            sheet = cssutils.parseString(content, href=url)
        except Exception:
            log.warn("Error parsing %s", url)
            return

        linked_fetchables = []
        for img_url in cssutils.getUrls(sheet):
            #Ignore data: uris
            if not img_url.startswith("data:"):
                linked_fetchables.append(Image(urlparse.urljoin(url, img_url), self.sfh))

        return linked_fetchables
Example #27
    def get_body(self):
        while not self.new_urls.empty():
            current_url = yield from self.new_urls.get()
            if current_url in self.visited_urls:
                continue
            self.visited_urls.append(current_url)
            if current_url.name:
                file_name = current_url.name
            elif current_url.raw_path != '/':
                file_name = current_url.path.rsplit('/')[1]
            else:
                file_name = 'index.html'
            file_path = os.path.dirname(current_url.path)
            if file_path == '/':
                file_path = self.target_path
            else:
                file_path = os.path.join(self.target_path, file_path[1:])

            print('path: ', file_path, 'name: ', file_name)

            if file_path and not os.path.exists(file_path):
                os.makedirs(file_path)

            data = None
            try:
                with aiohttp.Timeout(10.0):
                    with aiohttp.ClientSession() as session:
                        response = yield from session.get(current_url)
                        data = yield from response.read()
            except aiohttp.ClientError as client_error:
                print(client_error)
            else:
                response.release()
                session.close()
            if data is not None:
                if re.match(re.compile('.*\.(html|php)'), file_name):
                    soup = yield from self.replace_links(data)
                    data = str(soup).encode()
                with open(os.path.join(file_path, file_name), 'wb') as index_fh:
                    index_fh.write(data)
                if '.css' in file_name:
                    css = cssutils.parseString(data)
                    for carved_url in cssutils.getUrls(css):
                        if carved_url.startswith('data'):
                            continue
                        carved_url = yarl.URL(carved_url)
                        if not carved_url.is_absolute():
                            carved_url = self.root.join(carved_url)
                        if carved_url not in self.visited_urls:
                            yield from self.new_urls.put(carved_url)
Example #28
    async def get_body(self, session):
        while not self.new_urls.empty():
            print(animation[self.itr % len(animation)], end="\r")
            self.itr = self.itr + 1
            current_url, level = await self.new_urls.get()
            if current_url.human_repr() in self.visited_urls:
                continue
            self.visited_urls.append(current_url.human_repr())
            file_name, hash_name = self._make_filename(current_url)
            self.logger.debug('Cloned file: %s', file_name)
            data = None
            content_type = None
            try:
                response = await session.get(current_url,
                                             headers={'Accept': 'text/html'},
                                             timeout=10.0)
                headers = self.get_headers(response)
                content_type = response.content_type
                data = await response.read()
            except (aiohttp.ClientError, asyncio.TimeoutError) as client_error:
                self.logger.error(client_error)
            else:
                await response.release()

            if data is not None:
                self.meta[file_name]['hash'] = hash_name
                self.meta[file_name]['headers'] = headers
                if (response.status in [401, 403]):
                    self.setting["auth_list"].append(file_name)
                self.counter = self.counter + 1

                if content_type == 'text/html':
                    soup = await self.replace_links(data, level)
                    data = str(soup).encode()
                elif content_type == 'text/css':
                    css = cssutils.parseString(data,
                                               validate=self.css_validate)
                    for carved_url in cssutils.getUrls(css):
                        if carved_url.startswith('data'):
                            continue
                        carved_url = yarl.URL(carved_url)
                        if not carved_url.is_absolute():
                            carved_url = self.root.join(carved_url)
                        if carved_url.human_repr() not in self.visited_urls:
                            await self.new_urls.put((carved_url, level + 1))

                with open(os.path.join(self.target_path, hash_name),
                          'wb') as index_fh:
                    index_fh.write(data)
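The CSS handling above can be exercised on its own; the base URL, the relative path, and the validate flag below are invented stand-ins for the cloner's configuration.

import cssutils
import yarl

root = yarl.URL('http://example.com/static/style.css')
css = cssutils.parseString('body { background: url(../img/bg.png) }', validate=False)

for carved_url in cssutils.getUrls(css):
    if carved_url.startswith('data'):
        continue  # skip data: URIs, as the cloner does
    carved_url = yarl.URL(carved_url)
    if not carved_url.is_absolute():
        carved_url = root.join(carved_url)  # resolve relative to the base URL
    print(carved_url.human_repr())  # http://example.com/img/bg.png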
Example #29
async def crawler(client, url_queue, archive):
    while True:
        url = await url_queue.get()
        try:
            log.debug(url)
            headers = ACCEPT_HEADERS
            headers['Referer'] = archive['top']
            response = await client.get(url, headers=headers)
            if response.status != 200:
                log.warn('BAD RESPONSE: {}: {}'.format(response.status, url))
            else:
                data = await response.read()
                content_type, params = parse_header(response.headers['content-type'])
                item = {
                    "WebResourceData": data,
                    "WebResourceMIMEType": content_type,
                    "WebResourceURL": url
                }
                if 'charset' in params:
                    item['WebResourceTextEncodingName'] = params['charset']
                # TODO: attempt to reproduce the way HTTP headers are stored (NSKeyedArchiver?)
                archive['items'].append(item)
                archive['seen'][url] = True
                if 'text/html' == content_type:
                    dom = html.fromstring(data)
                    patterns = ['//img/@src', '//img/@data-src', '//img/@data-src-retina', '//script/@src', "//link[@rel='stylesheet']/@href"]
                    for path in patterns:
                        for attr in dom.xpath(path):
                            log.debug("{}: {} {}".format(path, url, attr))
                            url = unquote(urljoin(url, urldefrag(attr)[0]))
                            if url not in archive['seen']:
                                archive['seen'][url] = True
                                await url_queue.put(url)
                elif 'text/css' == content_type:
                    # TODO: nested @import and better path inference
                    for attr in getUrls(parseString(data)):
                        log.debug(attr)
                        url = unquote(urljoin(url, urldefrag(attr)[0]))
                        if url not in archive['seen']:
                            archive['seen'][url] = True
                            await url_queue.put(url)
        except Exception as exc:
            log.warn('Exception {}:'.format(exc), exc_info=True)

        finally:
            url_queue.task_done()
Example #30
    async def get_body(self, session):
        while not self.new_urls.empty():
            current_url, level = await self.new_urls.get()
            if current_url.human_repr() in self.visited_urls:
                continue
            self.visited_urls.append(current_url.human_repr())
            file_name, hash_name = self._make_filename(current_url)
            print('name: ', file_name)
            self.meta[file_name] = {}

            data = None
            content_type = None
            try:
                response = await session.get(current_url,
                                             headers={'Accept': 'text/html'},
                                             timeout=10.0)
                content_type = response.content_type
                data = await response.read()

            except (aiohttp.ClientError, asyncio.TimeoutError) as client_error:
                self.logger.error(client_error)
            else:
                await response.release()
            if data is not None:
                self.meta[file_name]['hash'] = hash_name
                self.meta[file_name]['content_type'] = content_type
                if content_type == 'text/html':
                    soup = await self.replace_links(data, level)
                    data = str(soup).encode()
                with open(os.path.join(self.target_path, hash_name),
                          'wb') as index_fh:
                    index_fh.write(data)
                if content_type == 'text/css':
                    css = cssutils.parseString(data,
                                               validate=self.css_validate)
                    for carved_url in cssutils.getUrls(css):
                        if carved_url.startswith('data'):
                            continue
                        carved_url = yarl.URL(carved_url)
                        if not carved_url.is_absolute():
                            carved_url = self.root.join(carved_url)
                        if carved_url.human_repr() not in self.visited_urls:
                            await self.new_urls.put((carved_url, level + 1))
Example #31
    def test_getUrls(self):
        "cssutils.getUrls()"
        cssutils.ser.prefs.keepAllProperties = True

        css = '''
        @import "im1";
        @import url(im2);
        @import url( im3 );
        @import url( "im4" );
        @import url( 'im5' );
        a {
            background-image: url(c) !important;
            background-\image: url(b);
            background: url(a) no-repeat !important;
            }'''
        s = cssutils.parseString(css)
        urls = set(cssutils.getUrls(s))
        self.assertEqual(
            urls, set(["im1", "im2", "im3", "im4", "im5", "c", "b", "a"]))

        cssutils.ser.prefs.keepAllProperties = False
Example #32
    def test_getUrls(self):
        "cssutils.getUrls()"
        cssutils.ser.prefs.keepAllProperties = True

        css='''
        @import "im1";
        @import url(im2);
        @import url( im3 );
        @import url( "im4" );
        @import url( 'im5' );
        a {
            background-image: url(c) !important;
            background-\image: url(b);
            background: url(a) no-repeat !important;
            }'''
        s = cssutils.parseString(css)
        urls = set(cssutils.getUrls(s))
        self.assertEqual(urls, set(["im1", "im2", "im3", "im4", "im5", 
                                    "c", "b", "a"]))

        cssutils.ser.prefs.keepAllProperties = False
Example #33
    async def get_body(self, session):
        while not self.new_urls.empty():
            current_url, level = await self.new_urls.get()
            if current_url.human_repr() in self.visited_urls:
                continue
            self.visited_urls.append(current_url.human_repr())
            file_name, hash_name = self._make_filename(current_url)
            print('name: ', file_name)
            self.meta[file_name] = {}

            data = None
            content_type = None
            try:
                response = await session.get(current_url, headers={'Accept': 'text/html'}, timeout=10.0)
                content_type = response.content_type
                data = await response.read()

            except (aiohttp.ClientError, asyncio.TimeoutError) as client_error:
                self.logger.error(client_error)
            else:
                await response.release()
            if data is not None:
                self.meta[file_name]['hash'] = hash_name
                self.meta[file_name]['content_type'] = content_type
                if content_type == 'text/html':
                    soup = await self.replace_links(data, level)
                    data = str(soup).encode()
                with open(os.path.join(self.target_path, hash_name), 'wb') as index_fh:
                    index_fh.write(data)
                if content_type == 'text/css':
                    css = cssutils.parseString(data, validate=self.css_validate)
                    for carved_url in cssutils.getUrls(css):
                        if carved_url.startswith('data'):
                            continue
                        carved_url = yarl.URL(carved_url)
                        if not carved_url.is_absolute():
                            carved_url = self.root.join(carved_url)
                        if carved_url.human_repr() not in self.visited_urls:
                            await self.new_urls.put((carved_url, level + 1))
Example #34
def css_download_url_refs(root_url, url_parts, sheet, dst_folder):
    """
    download images etc referenced in the css file
    """
    tmp_url_parts = list(urlparse.urlparse(deepcopy(root_url)))  # url_parts
    url_from_tmp_url_parts = urlparse.urlunparse(tmp_url_parts)
    urls = cssutils.getUrls(sheet)
    for url in urls:
        if url.startswith("/"):
            file_name = url.split("/")[-1]
            file_name = sanitize_file_name(file_name)
            new_src_url = sanitize_url(list(urlparse.urlparse(url))[2])
            new_src = create_directories(dst_folder, new_src_url)
            full_path = os.path.join(dst_folder, new_src)
            outpath = os.path.join(full_path, file_name)
            path = root_url + url
            if file_name != "":
                logging.debug("downloading css reference " + file_name + "..." + path)
                if url.lower().startswith("http"):
                    download_file(url, outpath)
                else:
                    download_file(path, outpath)
        else:
            file_name = url.split("/")[-1]
            file_name = sanitize_file_name(file_name)
            new_src_url = sanitize_url(list(urlparse.urlparse(url))[2])
            new_src = create_directories(dst_folder, new_src_url)
            full_path = os.path.join(dst_folder, new_src)
            outpath = os.path.join(full_path, file_name)
            path = root_url.replace(file_name, "") + url
            if file_name != "":
                logging.debug("downloading css reference " + file_name + "...")
                if url.lower().startswith("http"):
                    download_file(url, outpath)
                else:
                    download_file(path, outpath)
Example #35
 def show_urls(s, data):
     stylesheet = cssutils.parseString(s)  # parseFile (f)  # its a start :)
     #print [u for u in cssutils.getUrls (stylesheet)]
     for u in cssutils.getUrls(stylesheet):
         print u
Example #36
File: engine.py Project: jowolf/tlg
 def show_urls (s, data):
   stylesheet = cssutils.parseString (s)  # parseFile (f)  # its a start :)
   #print [u for u in cssutils.getUrls (stylesheet)]
   for u in cssutils.getUrls (stylesheet):
     print u
Example #37
def geturls(stylesheet):
	"""
	Return a list of all URLs appearing in the :class:`CSSStyleSheet`
	:obj:`stylesheet`.
	"""
	return [url.URL(u) for u in cssutils.getUrls(stylesheet)] # This requires cssutils 0.9.5b1
Example #38
 def get_media_requests(self, item, info):
     sheet = CSSStyleSheet()
     sheet.cssText = item['content']
     urls = cssutils.getUrls(sheet)
     return [Request(u) for u in urls]
Example #39
 for i in range(len(a)):
     directory =  a[i]['href']
     if(".css" not in directory):
         print("-------Skipped for ---------",directory)
         continue
     if "http" in directory or "https" in directory:
         print ("------Skipped for ----- ",directory)
         continue
     print ('\t[+]Getting CSS = '+str(directory))
     if "/" not in directory:
             print ("\tNo directory. Saving file",directory)
     elif not os.path.exists(os.path.dirname(directory)):
         print ("    [DIR]Creating directory")
         os.makedirs(os.path.dirname(directory))
     testfile, headers = urlretrieve(baseurl+directory, directory, reporthook=report)   
     urls = list( cssutils.getUrls(cssutils.parseFile(directory)))
     if(len(urls)!=0):
         for link in urls:
             try:
                 if "http" in directory or "https" in link or "data:image/" in link:
                     print ("------Skipped for ----- ",link)
                     continue
                 while("../" in link):
                     if("assets" in link):
                         link = link[3:]
                     else:
                         link = "assets/"+link[3:]
                 print ('\t\t[+]Getting CSS-Image = '+str(link))
                 if "/" not in link:
                         print ("\t\tNo directory. Saving file",link)
                 elif not os.path.exists(os.path.dirname(link)):
Example #40
    def get_body(self, root_url, urls, visited_urls):
        if not root_url.startswith("http"):
            root_url = 'http://' + root_url
        visited_urls.append(root_url)
        parsed_url = urlparse(root_url)
        if parsed_url.fragment:
            return
        domain = parsed_url.netloc
        if not domain.endswith('/'):
            domain += '/'
        file_name = self.make_new_link(root_url)

        file_path = ''
        patt = '/.*/.*\.'
        if re.match(patt, file_name):
            file_path, file_name = file_name.rsplit('/', 1)
            file_path += '/'
        print('path: ', file_path, 'name: ', file_name)
        if len(domain) < 4:
            sys.exit('invalid target {}'.format(root_url))
        page_path = '/opt/snare/pages/{}'.format(domain)

        if not os.path.exists(page_path):
            os.mkdir(page_path)

        if file_path and not os.path.exists(page_path + file_path):
            os.makedirs(page_path + file_path)

        data = None
        try:
            with aiohttp.Timeout(10.0):
                with aiohttp.ClientSession() as session:
                    response = yield from session.get(root_url)
                    data = yield from response.read()
        except Exception as e:
            print(e)
        else:
            response.release()
            session.close()
        if data is not None:
            if re.match(re.compile('.*\.(html|php)'), file_name):
                soup = self.replace_links(data, domain, urls)
                data = str(soup).encode()
            with open(page_path + file_path + file_name, 'wb') as index_fh:
                index_fh.write(data)
            if '.css' in file_name:
                css = cssutils.parseString(data)
                for carved_url in cssutils.getUrls(css):
                    if carved_url.startswith('data'):
                        continue
                    carved_url = os.path.normpath(
                        os.path.join(domain, carved_url))
                    if not carved_url.startswith('http'):
                        if carved_url.startswith(
                                '..') or carved_url.startswith('/'):
                            carved_url = 'http://' + domain + carved_url
                        else:
                            carved_url = 'http://' + carved_url
                    if carved_url not in visited_urls:
                        urls.insert(0, carved_url)
        for url in urls:
            urls.remove(url)
            if url in visited_urls:
                continue
            yield from self.get_body(url, urls, visited_urls)
Example #41
    def reviews(self, app_id_list, page=0):
        #print("In Scraper - reviews def   \n")
        reviews_adder = []
        for n in range(len(app_id_list)):
            #app_id=app_id_list[n]
            #print(app_id)
            data = {
                'reviewType': 0,
                'pageNum': page,
                'id': app_id_list[n],
                'reviewSortOrder': 4,
                'xhr': 1,
                'hl': self.language
            }
            self.params['authuser'] = '******'
            #print('before send request')
            #print(app_id_list[n])
            response = send_request('POST', s.REVIEW_URL, data, self.params)
            content = response.text
            content = content[content.find('[["ecr"'):].strip()
            data = json.loads(content)
            #print(data)
            html = data[0][2]
            soup = BeautifulSoup(html, 'lxml', from_encoding='utf8')
            #print(soup)
            reviews = []
            for element in soup.select('.single-review'):
                #print('Inside single review')
                review = {}
                review['rev_app_id'] = app_id_list[n]
                avatar_style = element.select_one('.author-image').get('style')
                if avatar_style:
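                    # Wrap the inline style in a throwaway rule so cssutils
                    # can extract the avatar image URL from its url() value.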
                    sheet = cssutils.css.CSSStyleSheet()
                    sheet.add('tmp { %s }' % avatar_style)
                    review['author_image'] = list(cssutils.getUrls(sheet))[0]

                review_header = element.select_one('.review-header')
                review['review_id'] = review_header.get('data-reviewid', '')
                review['review_permalink'] = review_header.select_one(
                    '.reviews-permalink').get('href')

                review['author_name'] = review_header.select_one(
                    '.author-name').text
                review['review_date'] = review_header.select_one(
                    '.review-date').text

                # '.current-rating' encodes the score as a CSS width
                # percentage; every 20% corresponds to one star.
                curr_rating = review_header.select_one('.current-rating').get(
                    'style')
                review['current_rating'] = int(
                    int(
                        str(cssutils.parseStyle(curr_rating).width).replace(
                            '%', '')) / 20)

                body_elem = element.select_one('.review-body')
                review_title = body_elem.select_one('.review-title').extract()
                body_elem.select_one('.review-link').decompose()
                review['review_title'] = review_title.text
                review['review_body'] = body_elem.text

                reviews.append(review)
                reviews_adder.append(review)
                data = ''
        return reviews_adder
Example #43
0
def web_crawler(url, depth=0, page_assets=False):

    if depth >= 0:
        opener = request.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla')]
        request.install_opener(opener)

        base_url = "{0.scheme}://{0.netloc}/".format(parse.urlsplit(url))
        if url not in links:
            links.append(url)
        raw = requests.get(url).text
        if page_assets:
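            # When page assets are requested, try to parse the document as CSS
            # and record every url() reference; any failure is swallowed by
            # the bare except below.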
            try:
                sheet = cssutils.parseString(requests.get(url).content)
                asset_urls = cssutils.getUrls(sheet)
                # Use a distinct loop variable so the page-level "url" is not
                # clobbered before the rest of the page is processed below.
                for asset_url in asset_urls:
                    if asset_url not in links:
                        links.append(asset_url)

                        path = request.urlopen(asset_url)
                        meta = path.info()
                        print(asset_url, ' size: ',
                              meta.get(name="Content-Length"))
            except:
                pass

        soup = bs(raw, 'html.parser')

        for script in soup.find_all("script"):
            if script.attrs.get("src"):
                script_url = parse.urljoin(url, script.attrs.get("src"))
                if script_url not in assets:
                    path = request.urlopen(script_url)
                    meta = path.info()
                    print(script_url, ' size: ',
                          meta.get(name="Content-Length"))
                    assets.append(script_url)
                    if page_assets and script_url not in links:
                        links.append(script_url)
                        web_crawler(script_url, depth - 1, page_assets)

        # Record every linked stylesheet and its Content-Length; when
        # page_assets is set, recurse into it to collect its own assets.
        for css in soup.find_all("link", {"rel": "stylesheet"}):
            if css.attrs.get("href"):
                css_url = parse.urljoin(url, css.attrs.get("href"))
                if css_url not in assets:
                    try:
                        path = request.urlopen(css_url)
                        meta = path.info()
                        print(css_url, ' ', 'size: ',
                              meta.get(name="Content-Length"))
                        assets.append(css_url)
                        if page_assets and css_url not in links:
                            links.append(css_url)
                            web_crawler(css_url, depth - 1, page_assets)
                    except:
                        pass

        for img in soup.find_all("img"):
            if img.get("src"):
                img_url = parse.urljoin(url, img.get("src"))
                try:
                    path = request.urlopen(img_url)
                    meta = path.info()

                    if img_url not in assets:
                        print(img_url, ' ', 'size: ',
                              meta.get(name="Content-Length"))
                        assets.append(img_url)
                except:
                    pass

        for a in soup.find_all('a'):
            href = str(a.get('href'))

            if 'http://' not in href and 'https://' not in href and base_url not in href:
                href = base_url + href[1:]

            if href not in links:
                path = request.urlopen(href)
                meta = path.info()

                print(href, ' ', 'size: ', meta.get(name="Content-Length"))

                links.append(href)
                web_crawler(href, depth - 1, page_assets)
Example #44
0
 def get_media_requests(self, item, info):
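     # Build a stylesheet from the item's CSS text and schedule one download
     # Request for every url() reference it contains.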
     sheet = CSSStyleSheet()
     sheet.cssText = item['content']
     urls = cssutils.getUrls(sheet)
     return [Request(u) for u in urls]
Example #45
0
    def _manifest_add_missing(self, invalid):
        import cssutils
        manifest = self.oeb.manifest
        known = set(manifest.hrefs)
        unchecked = set(manifest.values())
        cdoc = OEB_DOCS|OEB_STYLES
        invalid = set()
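        # Keep scanning newly added manifest items for outgoing references
        # until no further hrefs are discovered.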
        while unchecked:
            new = set()
            for item in unchecked:
                data = None
                if (item.media_type in cdoc or
                        item.media_type[-4:] in ('/xml', '+xml')):
                    try:
                        data = item.data
                    except:
                        self.oeb.log.exception(u'Failed to read from manifest '
                                u'entry with id: %s, ignoring'%item.id)
                        invalid.add(item)
                        continue
                if data is None:
                    continue

                if (item.media_type in OEB_DOCS or
                        item.media_type[-4:] in ('/xml', '+xml')):
                    hrefs = [r[2] for r in iterlinks(data)]
                    for href in hrefs:
                        if isinstance(href, bytes):
                            href = href.decode('utf-8')
                        href, _ = urldefrag(href)
                        if not href:
                            continue
                        try:
                            href = item.abshref(urlnormalize(href))
                            scheme = urlparse(href).scheme
                        except:
                            self.oeb.log.exception(
                                'Skipping invalid href: %r'%href)
                            continue
                        if not scheme and href not in known:
                            new.add(href)
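                # Stylesheets are scanned with cssutils.getUrls() rather than
                # iterlinks(), since their references live in url() tokens.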
                elif item.media_type in OEB_STYLES:
                    try:
                        urls = list(cssutils.getUrls(data))
                    except:
                        urls = []
                    for url in urls:
                        href, _ = urldefrag(url)
                        href = item.abshref(urlnormalize(href))
                        scheme = urlparse(href).scheme
                        if not scheme and href not in known:
                            new.add(href)
            unchecked.clear()
            warned = set([])
            for href in new:
                known.add(href)
                is_invalid = False
                for item in invalid:
                    if href == item.abshref(urlnormalize(href)):
                        is_invalid = True
                        break
                if is_invalid:
                    continue
                if not self.oeb.container.exists(href):
                    if href not in warned:
                        self.logger.warn('Referenced file %r not found' % href)
                        warned.add(href)
                    continue
                if href not in warned:
                    self.logger.warn('Referenced file %r not in manifest' % href)
                    warned.add(href)
                id, _ = manifest.generate(id='added')
                guessed = guess_type(href)[0]
                media_type = guessed or BINARY_MIME
                added = manifest.add(id, href, media_type)
                unchecked.add(added)

            for item in invalid:
                self.oeb.manifest.remove(item)
Example #46
0
def get_gumroad_icon_url(data):
    soup = BeautifulSoup(data, 'html.parser')
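    # The profile picture URL lives in an inline style attribute; wrap it in
    # a dummy rule so cssutils can pull the url() value out.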
    sheet = cssutils.css.CSSStyleSheet()
    sheet.add("dummy_selector { %s }" % soup.select_one(
        '.profile-picture-medium.js-profile-picture').get('style'))
    return list(cssutils.getUrls(sheet))[0]