Code Example #1
 def process_stylesheets(self, soup, baseurl):
     diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
         try:
             mtype = tag['type']
         except KeyError:
             mtype = 'text/css' if tag.name.lower() == 'style' else ''
         if mtype.lower() != 'text/css':
             continue
         if tag.has_attr('href'):
             iurl = tag['href']
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             found_cached = False
             with self.stylemap_lock:
                 if iurl in self.stylemap:
                     tag['href'] = self.stylemap[iurl]
                     found_cached = True
             if found_cached:
                 continue
             try:
                 data = self.fetch_url(iurl)
             except Exception:
                 self.log.exception('Could not fetch stylesheet ', iurl)
                 continue
             stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
             with self.stylemap_lock:
                 self.stylemap[iurl] = stylepath
             with open(stylepath, 'wb') as x:
                 x.write(data)
             tag['href'] = stylepath
         else:
             for ns in tag.findAll(text=True):
                 src = str(ns)
                 m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                 if m:
                     iurl = m.group(1)
                     if not urlsplit(iurl).scheme:
                         iurl = urljoin(baseurl, iurl, False)
                     found_cached = False
                     with self.stylemap_lock:
                         if iurl in self.stylemap:
                             ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                             found_cached = True
                     if found_cached:
                         continue
                     try:
                         data = self.fetch_url(iurl)
                     except Exception:
                         self.log.exception('Could not fetch stylesheet ', iurl)
                         continue
                     c += 1
                     stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                     with self.stylemap_lock:
                         self.stylemap[iurl] = stylepath
                     with open(stylepath, 'wb') as x:
                         x.write(data)
                     ns.replaceWith(src.replace(m.group(1), stylepath))
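
The inline-<style> branch above depends on self.__class__.CSS_IMPORT_PATTERN, which is defined elsewhere in calibre. A minimal, self-contained sketch of that step, using a hypothetical stand-in pattern (the real regex may differ):

    import re

    # Hypothetical stand-in for CSS_IMPORT_PATTERN (defined elsewhere in calibre).
    CSS_IMPORT_PATTERN = re.compile(r'@import\s+url\(([^)]+)\)', re.IGNORECASE)

    src = 'body { color: #000 }\n@import url(print.css);'
    m = CSS_IMPORT_PATTERN.search(src)
    if m:
        print(m.group(1))  # -> print.css
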
Code Example #2
 def process_stylesheets(self, soup, baseurl):
     diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     for c, tag in enumerate(
             soup.findAll(lambda tag: tag.name.lower() in ['link', 'style']
                          and tag.has_key('type') and tag['type'].lower(
                          ) == 'text/css')):  # noqa
         if tag.has_key('href'):  # noqa
             iurl = tag['href']
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             with self.stylemap_lock:
                 if self.stylemap.has_key(iurl):  # noqa
                     tag['href'] = self.stylemap[iurl]
                     continue
             try:
                 data = self.fetch_url(iurl)
             except Exception:
                 self.log.exception('Could not fetch stylesheet ', iurl)
                 continue
             stylepath = os.path.join(diskpath, 'style' + str(c) + '.css')
             with self.stylemap_lock:
                 self.stylemap[iurl] = stylepath
             with open(stylepath, 'wb') as x:
                 x.write(data)
             tag['href'] = stylepath
         else:
             for ns in tag.findAll(text=True):
                 src = str(ns)
                 m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                 if m:
                     iurl = m.group(1)
                     if not urlsplit(iurl).scheme:
                         iurl = urljoin(baseurl, iurl, False)
                     with self.stylemap_lock:
                         if self.stylemap.has_key(iurl):  # noqa
                             ns.replaceWith(
                                 src.replace(m.group(1),
                                             self.stylemap[iurl]))
                             continue
                     try:
                         data = self.fetch_url(iurl)
                     except Exception:
                         self.log.exception('Could not fetch stylesheet ',
                                            iurl)
                         continue
                     c += 1
                     stylepath = os.path.join(diskpath,
                                              'style' + str(c) + '.css')
                     with self.stylemap_lock:
                         self.stylemap[iurl] = stylepath
                     with open(stylepath, 'wb') as x:
                         x.write(data)
                     ns.replaceWith(src.replace(m.group(1), stylepath))
Code Example #3
File: simple.py Project: JimmXinu/calibre
 def process_stylesheets(self, soup, baseurl):
     diskpath = unicode_path(os.path.join(self.current_dir, 'stylesheets'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     for c, tag in enumerate(soup.findAll(name=['link', 'style'])):
         try:
             mtype = tag['type']
         except KeyError:
             mtype = 'text/css' if tag.name.lower() == 'style' else ''
         if mtype.lower() != 'text/css':
             continue
         if tag.has_attr('href'):
             iurl = tag['href']
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             with self.stylemap_lock:
                 if iurl in self.stylemap:
                     tag['href'] = self.stylemap[iurl]
                     continue
             try:
                 data = self.fetch_url(iurl)
             except Exception:
                 self.log.exception('Could not fetch stylesheet ', iurl)
                 continue
             stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
             with self.stylemap_lock:
                 self.stylemap[iurl] = stylepath
             with open(stylepath, 'wb') as x:
                 x.write(data)
             tag['href'] = stylepath
         else:
             for ns in tag.findAll(text=True):
                 src = str(ns)
                 m = self.__class__.CSS_IMPORT_PATTERN.search(src)
                 if m:
                     iurl = m.group(1)
                     if not urlsplit(iurl).scheme:
                         iurl = urljoin(baseurl, iurl, False)
                     with self.stylemap_lock:
                         if iurl in self.stylemap:
                             ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                             continue
                     try:
                         data = self.fetch_url(iurl)
                     except Exception:
                         self.log.exception('Could not fetch stylesheet ', iurl)
                         continue
                     c += 1
                     stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                     with self.stylemap_lock:
                         self.stylemap[iurl] = stylepath
                     with open(stylepath, 'wb') as x:
                         x.write(data)
                     ns.replaceWith(src.replace(m.group(1), stylepath))
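
All three stylesheet variants share the same lock-guarded cache: the shared stylemap dict is only read and written while stylemap_lock is held, and the slow fetch_url call happens outside the lock. A minimal sketch of that idiom (a plain threading.Lock is assumed here; the lock type calibre actually uses may differ):

    import threading

    stylemap = {}
    stylemap_lock = threading.Lock()

    def cached_stylesheet(url, fetch):
        # Check the shared cache under the lock.
        with stylemap_lock:
            if url in stylemap:
                return stylemap[url]
        path = fetch(url)  # slow network/disk work happens outside the lock
        with stylemap_lock:
            stylemap[url] = path
        return path

    print(cached_stylesheet('https://example.com/a.css', lambda u: 'stylesheets/style0.css'))
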
Code Example #4
 def absurl(self, baseurl, tag, key, filter=True):
     iurl = tag[key]
     parts = urlsplit(iurl)
     if not parts.netloc and not parts.path and not parts.query:
         return None
     if not parts.scheme:
         iurl = urljoin(baseurl, iurl, False)
     if not self.is_link_ok(iurl):
         self.log.debug('Skipping invalid link:', iurl)
         return None
     if filter and not self.is_link_wanted(iurl, tag):
         self.log.debug('Filtered link: ' + iurl)
         return None
     return iurl
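
The scheme check in absurl only joins relative links against baseurl; absolute URLs pass through unchanged. A small sketch of that behaviour using the standard-library urlsplit/urljoin (the examples above presumably import equivalents from calibre's compatibility layer):

    from urllib.parse import urljoin, urlsplit

    baseurl = 'https://example.com/news/index.html'
    for href in ('/world/story.html', 'https://other.example/x.html'):
        iurl = href
        if not urlsplit(iurl).scheme:
            # Relative link: resolve it against the page's base URL.
            iurl = urljoin(baseurl, iurl, False)
        print(iurl)
    # https://example.com/world/story.html
    # https://other.example/x.html
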
Code Example #5
File: simple.py Project: JimmXinu/calibre
 def absurl(self, baseurl, tag, key, filter=True):
     iurl = tag[key]
     parts = urlsplit(iurl)
     if not parts.netloc and not parts.path and not parts.query:
         return None
     if not parts.scheme:
         iurl = urljoin(baseurl, iurl, False)
     if not self.is_link_ok(iurl):
         self.log.debug('Skipping invalid link:', iurl)
         return None
     if filter and not self.is_link_wanted(iurl, tag):
         self.log.debug('Filtered link: '+iurl)
         return None
     return iurl
Code Example #6
    def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
        res = ''
        diskpath = os.path.join(self.current_dir, into_dir)
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        prev_dir = self.current_dir
        try:
            self.current_dir = diskpath
            tags = list(soup.findAll('a', href=True))

            for c, tag in enumerate(tags):
                if self.show_progress:
                    print('.', end=' ')
                    sys.stdout.flush()
                sys.stdout.flush()
                iurl = self.absurl(baseurl,
                                   tag,
                                   'href',
                                   filter=recursion_level != 0)
                if not iurl:
                    continue
                nurl = self.normurl(iurl)
                if self.filemap.has_key(nurl):  # noqa
                    self.localize_link(tag, 'href', self.filemap[nurl])
                    continue
                if self.files > self.max_files:
                    return res
                linkdir = 'link' + str(c) if into_dir else ''
                linkdiskpath = os.path.join(diskpath, linkdir)
                if not os.path.exists(linkdiskpath):
                    os.mkdir(linkdiskpath)
                try:
                    self.current_dir = linkdiskpath
                    dsrc = self.fetch_url(iurl)
                    newbaseurl = dsrc.newurl
                    if len(dsrc) == 0 or \
                       len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
                        raise ValueError('No content at URL %r' % iurl)
                    if callable(self.encoding):
                        dsrc = self.encoding(dsrc)
                    elif self.encoding is not None:
                        dsrc = dsrc.decode(self.encoding, 'replace')
                    else:
                        dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                    st = time.time()
                    soup = self.get_soup(dsrc, url=iurl)
                    self.log.debug('Parsed %s in %.1f seconds' %
                                   (iurl, time.time() - st))

                    base = soup.find('base', href=True)
                    if base is not None:
                        newbaseurl = base['href']
                    self.log.debug('Processing images...')
                    self.process_images(soup, newbaseurl)
                    if self.download_stylesheets:
                        self.process_stylesheets(soup, newbaseurl)

                    _fname = basename(iurl)
                    if not isinstance(_fname, unicode_type):
                        _fname = _fname.decode('latin1', 'replace')
                    _fname = _fname.encode('ascii', 'replace').replace(
                        '%', '').replace(os.sep, '')
                    _fname = ascii_filename(_fname)
                    _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
                    res = os.path.join(linkdiskpath, _fname)
                    self.downloaded_paths.append(res)
                    self.filemap[nurl] = res
                    if recursion_level < self.max_recursions:
                        self.log.debug('Processing links...')
                        self.process_links(soup, newbaseurl,
                                           recursion_level + 1)
                    else:
                        self.process_return_links(soup, newbaseurl)
                        self.log.debug(
                            'Recursion limit reached. Skipping links in', iurl)

                    if newbaseurl and not newbaseurl.startswith('/'):
                        for atag in soup.findAll(
                                'a', href=lambda x: x and x.startswith('/')):
                            atag['href'] = urljoin(newbaseurl, atag['href'],
                                                   True)
                    if callable(self.postprocess_html_ext):
                        soup = self.postprocess_html_ext(
                            soup, c == 0 and recursion_level == 0
                            and not getattr(self, 'called_first', False),
                            self.job_info)

                        if c == 0 and recursion_level == 0:
                            self.called_first = True

                    save_soup(soup, res)
                    self.localize_link(tag, 'href', res)
                except Exception as err:
                    if isinstance(err, AbortArticle):
                        raise
                    self.failed_links.append((iurl, traceback.format_exc()))
                    self.log.exception('Could not fetch link', iurl)
                finally:
                    self.current_dir = diskpath
                    self.files += 1
        finally:
            self.current_dir = prev_dir
        if self.show_progress:
            print()
        return res
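
The "No content" guard in process_links treats a page whose body is nothing but HTML comments and whitespace as empty. In isolation:

    import re

    dsrc = '<!-- ad slot -->\n  <!-- tracking pixel -->  \n'
    # Strip all HTML comments; if only whitespace remains, the page is
    # considered empty and the link is recorded as failed.
    if len(dsrc) == 0 or \
       len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
        print('No content at URL')
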
Code Example #7
 def process_images(self, soup, baseurl):
     diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     c = 0
     for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.
                             has_key('src')):  # noqa
         iurl = tag['src']
         if iurl.startswith('data:image/'):
             try:
                 data = b64decode(iurl.partition(',')[-1])
             except:
                 self.log.exception('Failed to decode embedded image')
                 continue
         else:
             if callable(self.image_url_processor):
                 iurl = self.image_url_processor(baseurl, iurl)
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             with self.imagemap_lock:
                 if self.imagemap.has_key(iurl):  # noqa
                     tag['src'] = self.imagemap[iurl]
                     continue
             try:
                 data = self.fetch_url(iurl)
                 if data == 'GIF89a\x01':
                     # Skip empty GIF files as PIL errors on them anyway
                     continue
             except Exception:
                 self.log.exception('Could not fetch image ', iurl)
                 continue
         c += 1
         fname = ascii_filename('img' + str(c))
         if isinstance(fname, unicode_type):
             fname = fname.encode('ascii', 'replace')
         data = self.preprocess_image_ext(
             data, iurl) if self.preprocess_image_ext is not None else data
         if data is None:
             continue
         itype = what(None, data)
         if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
             # SVG image
             imgpath = os.path.join(diskpath, fname + '.svg')
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
                 x.write(data)
             tag['src'] = imgpath
         else:
             try:
                 # Ensure image is valid
                 img = image_from_data(data)
                 if itype not in {'png', 'jpg', 'jpeg'}:
                     itype = 'png' if itype == 'gif' else 'jpeg'
                     data = image_to_data(img, fmt=itype)
                 if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                     try:
                         data = self.rescale_image(data)
                     except Exception:
                         self.log.exception('failed to compress image ' +
                                            iurl)
                 # Moon+ apparently cannot handle .jpeg files
                 if itype == 'jpeg':
                     itype = 'jpg'
                 imgpath = os.path.join(diskpath, fname + '.' + itype)
                 with self.imagemap_lock:
                     self.imagemap[iurl] = imgpath
                 with open(imgpath, 'wb') as x:
                     x.write(data)
                 tag['src'] = imgpath
             except Exception:
                 traceback.print_exc()
                 continue
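
The data: branch of this version decodes the embedded image by hand: everything after the first comma of the URI is the base64 payload. A short sketch (the payload here is a truncated GIF used only to show the decoding step):

    from base64 import b64decode

    iurl = 'data:image/gif;base64,R0lGODlhAQABAAAAACw='
    # Everything after the first ',' is the base64-encoded image data.
    data = b64decode(iurl.partition(',')[-1])
    print(data[:6])  # -> b'GIF89a'
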
Code Example #8
File: simple.py Project: JimmXinu/calibre
    def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
        res = ''
        diskpath = os.path.join(self.current_dir, into_dir)
        if not os.path.exists(diskpath):
            os.mkdir(diskpath)
        prev_dir = self.current_dir
        try:
            self.current_dir = diskpath
            tags = list(soup.findAll('a', href=True))

            for c, tag in enumerate(tags):
                if self.show_progress:
                    print('.', end=' ')
                    sys.stdout.flush()
                sys.stdout.flush()
                iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
                if not iurl:
                    continue
                nurl = self.normurl(iurl)
                if nurl in self.filemap:
                    self.localize_link(tag, 'href', self.filemap[nurl])
                    continue
                if self.files > self.max_files:
                    return res
                linkdir = 'link'+str(c) if into_dir else ''
                linkdiskpath = os.path.join(diskpath, linkdir)
                if not os.path.exists(linkdiskpath):
                    os.mkdir(linkdiskpath)
                try:
                    self.current_dir = linkdiskpath
                    dsrc = self.fetch_url(iurl)
                    newbaseurl = dsrc.newurl
                    if len(dsrc) == 0 or \
                       len(re.compile(b'<!--.*?-->', re.DOTALL).sub(b'', dsrc).strip()) == 0:
                        raise ValueError('No content at URL %r'%iurl)
                    if callable(self.encoding):
                        dsrc = self.encoding(dsrc)
                    elif self.encoding is not None:
                        dsrc = dsrc.decode(self.encoding, 'replace')
                    else:
                        dsrc = xml_to_unicode(dsrc, self.verbose)[0]

                    st = time.time()
                    soup = self.get_soup(dsrc, url=iurl)
                    self.log.debug('Parsed %s in %.1f seconds' % (iurl, time.time() - st))

                    base = soup.find('base', href=True)
                    if base is not None:
                        newbaseurl = base['href']
                    self.log.debug('Processing images...')
                    self.process_images(soup, newbaseurl)
                    if self.download_stylesheets:
                        self.process_stylesheets(soup, newbaseurl)

                    _fname = basename(iurl)
                    if not isinstance(_fname, unicode_type):
                        _fname = _fname.decode('latin1', 'replace')
                    _fname = _fname.replace('%', '').replace(os.sep, '')
                    _fname = ascii_filename(_fname)
                    _fname = os.path.splitext(_fname)[0][:120] + '.xhtml'
                    res = os.path.join(linkdiskpath, _fname)
                    self.downloaded_paths.append(res)
                    self.filemap[nurl] = res
                    if recursion_level < self.max_recursions:
                        self.log.debug('Processing links...')
                        self.process_links(soup, newbaseurl, recursion_level+1)
                    else:
                        self.process_return_links(soup, newbaseurl)
                        self.log.debug('Recursion limit reached. Skipping links in', iurl)

                    if newbaseurl and not newbaseurl.startswith('/'):
                        for atag in soup.findAll('a', href=lambda x: x and x.startswith('/')):
                            atag['href'] = urljoin(newbaseurl, atag['href'], True)
                    if callable(self.postprocess_html_ext):
                        soup = self.postprocess_html_ext(soup,
                                c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
                                self.job_info)

                        if c==0 and recursion_level == 0:
                            self.called_first = True

                    save_soup(soup, res)
                    self.localize_link(tag, 'href', res)
                except Exception as err:
                    if isinstance(err, AbortArticle):
                        raise
                    self.failed_links.append((iurl, traceback.format_exc()))
                    self.log.exception('Could not fetch link', iurl)
                finally:
                    self.current_dir = diskpath
                    self.files += 1
        finally:
            self.current_dir = prev_dir
        if self.show_progress:
            print()
        return res
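
The local file name for each downloaded article is derived from the link URL: the basename is sanitised, truncated to 120 characters and given an .xhtml extension. A rough stand-alone sketch, with calibre's ascii_filename helper approximated by an ASCII round-trip (a hypothetical simplification):

    import os
    from posixpath import basename
    from urllib.parse import urlsplit

    def local_name(url):
        name = basename(urlsplit(url).path) or 'index'
        name = name.replace('%', '').replace(os.sep, '')
        # Rough stand-in for calibre's ascii_filename().
        name = name.encode('ascii', 'replace').decode('ascii')
        return os.path.splitext(name)[0][:120] + '.xhtml'

    print(local_name('https://example.com/news/2020/story.html'))  # -> story.xhtml
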
Code Example #9
File: simple.py Project: JimmXinu/calibre
 def process_images(self, soup, baseurl):
     diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     c = 0
     for tag in soup.findAll('img', src=True):
         iurl = tag['src']
         if iurl.startswith('data:image/'):
             try:
                 data = from_base64_bytes(iurl.partition(',')[-1])
             except Exception:
                 self.log.exception('Failed to decode embedded image')
                 continue
         else:
             if callable(self.image_url_processor):
                 iurl = self.image_url_processor(baseurl, iurl)
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             with self.imagemap_lock:
                 if iurl in self.imagemap:
                     tag['src'] = self.imagemap[iurl]
                     continue
             try:
                 data = self.fetch_url(iurl)
                 if data == 'GIF89a\x01':
                     # Skip empty GIF files as PIL errors on them anyway
                     continue
             except Exception:
                 self.log.exception('Could not fetch image ', iurl)
                 continue
         c += 1
         fname = ascii_filename('img'+str(c))
         data = self.preprocess_image_ext(data, iurl) if self.preprocess_image_ext is not None else data
         if data is None:
             continue
         itype = what(None, data)
         if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
             # SVG image
             imgpath = os.path.join(diskpath, fname+'.svg')
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
                 x.write(data)
             tag['src'] = imgpath
         else:
             try:
                 # Ensure image is valid
                 img = image_from_data(data)
                 if itype not in {'png', 'jpg', 'jpeg'}:
                     itype = 'png' if itype == 'gif' else 'jpeg'
                     data = image_to_data(img, fmt=itype)
                 if self.compress_news_images and itype in {'jpg','jpeg'}:
                     try:
                         data = self.rescale_image(data)
                     except Exception:
                         self.log.exception('failed to compress image '+iurl)
                 # Moon+ apparently cannot handle .jpeg files
                 if itype == 'jpeg':
                     itype = 'jpg'
                 imgpath = os.path.join(diskpath, fname+'.'+itype)
                 with self.imagemap_lock:
                     self.imagemap[iurl] = imgpath
                 with open(imgpath, 'wb') as x:
                     x.write(data)
                 tag['src'] = imgpath
             except Exception:
                 traceback.print_exc()
                 continue
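
what(None, data) sniffs the image format from the leading magic bytes, which is what drives the SVG/PNG/JPEG handling above (calibre ships its own imghdr-style helper; the standard-library imghdr shown here behaves the same way for common formats, though it is deprecated in recent Python releases):

    from imghdr import what  # deprecated in newer Python versions

    print(what(None, b'\x89PNG\r\n\x1a\n' + b'\x00' * 16))  # -> png
    print(what(None, b'GIF89a' + b'\x00' * 16))             # -> gif
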
Code Example #10
 def process_images(self, soup, baseurl):
     diskpath = unicode_path(os.path.join(self.current_dir, 'images'))
     if not os.path.exists(diskpath):
         os.mkdir(diskpath)
     c = 0
     for tag in soup.findAll('img', src=True):
         iurl = tag['src']
         if iurl.startswith('data:'):
             try:
                 data = urlopen(iurl).read()
             except Exception:
                 self.log.exception('Failed to decode embedded image')
                 continue
         else:
             if callable(self.image_url_processor):
                 iurl = self.image_url_processor(baseurl, iurl)
             if not urlsplit(iurl).scheme:
                 iurl = urljoin(baseurl, iurl, False)
             found_in_cache = False
             with self.imagemap_lock:
                 if iurl in self.imagemap:
                     tag['src'] = self.imagemap[iurl]
                     found_in_cache = True
             if found_in_cache:
                 continue
             try:
                 data = self.fetch_url(iurl)
                 if data == b'GIF89a\x01':
                     # Skip empty GIF files as PIL errors on them anyway
                     continue
             except Exception:
                 self.log.exception('Could not fetch image ', iurl)
                 continue
         c += 1
         fname = ascii_filename('img' + str(c))
         data = self.preprocess_image_ext(
             data, iurl) if self.preprocess_image_ext is not None else data
         if data is None:
             continue
         itype = what(None, data)
         if itype == 'svg' or (itype is None and b'<svg' in data[:1024]):
             # SVG image
             imgpath = os.path.join(diskpath, fname + '.svg')
             with self.imagemap_lock:
                 self.imagemap[iurl] = imgpath
             with open(imgpath, 'wb') as x:
                 x.write(data)
             tag['src'] = imgpath
         else:
             from calibre.utils.img import image_from_data, image_to_data
             try:
                 # Ensure image is valid
                 img = image_from_data(data)
                 if itype not in {'png', 'jpg', 'jpeg'}:
                     itype = 'png' if itype == 'gif' else 'jpeg'
                     data = image_to_data(img, fmt=itype)
                 if self.compress_news_images and itype in {'jpg', 'jpeg'}:
                     try:
                         data = self.rescale_image(data)
                     except Exception:
                         self.log.exception('failed to compress image ' +
                                            iurl)
                 # Moon+ apparently cannot handle .jpeg files
                 if itype == 'jpeg':
                     itype = 'jpg'
                 imgpath = os.path.join(diskpath, fname + '.' + itype)
                 with self.imagemap_lock:
                     self.imagemap[iurl] = imgpath
                 with open(imgpath, 'wb') as x:
                     x.write(data)
                 tag['src'] = imgpath
             except Exception:
                 traceback.print_exc()
                 continue
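
Unlike the b64decode variant in Code Example #7, this last version hands data: URIs to urlopen, which understands them directly in Python 3:

    from urllib.request import urlopen

    # urlopen decodes the base64 payload of a data: URL itself.
    print(urlopen('data:text/plain;base64,aGVsbG8=').read())  # -> b'hello'
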