def __getitem__(self, book_id):
    with self.lock:
        if not hasattr(self, 'total_size'):
            self._load_index()
        self._invalidate_sizes()
        key = (self.group_id, book_id)
        entry = self.items.pop(key, None)
        if entry is None:
            return None, None
        if entry.thumbnail_size != self.thumbnail_size:
            try:
                os.remove(entry.path)
            except EnvironmentError as err:
                if getattr(err, 'errno', None) != errno.ENOENT:
                    self.log('Failed to remove cached thumbnail:', entry.path, as_unicode(err))
            self.total_size -= entry.size
            return None, None
        self.items[key] = entry
        try:
            with open(entry.path, 'rb') as f:
                data = f.read()
        except EnvironmentError as err:
            self.log('Failed to read cached thumbnail:', entry.path, as_unicode(err))
            return None, None
        return data, entry.timestamp
def insert(self, book_id, timestamp, data):
    if self.max_size < len(data):
        return
    with self.lock:
        if not hasattr(self, 'total_size'):
            self._load_index()
        self._invalidate_sizes()
        ts = ('%.2f' % timestamp).replace('.00', '')
        path = '%s%s%s%s%d-%s-%d-%dx%d' % (
            self.group_id, os.sep, book_id % 100, os.sep,
            book_id, ts, len(data), self.thumbnail_size[0], self.thumbnail_size[1])
        path = os.path.join(self.location, path)
        key = (self.group_id, book_id)
        e = self.items.pop(key, None)
        self.total_size -= getattr(e, 'size', 0)
        try:
            with open(path, 'wb') as f:
                f.write(data)
        except EnvironmentError as err:
            d = os.path.dirname(path)
            if not os.path.exists(d):
                try:
                    os.makedirs(d)
                    with open(path, 'wb') as f:
                        f.write(data)
                except EnvironmentError as err:
                    self.log('Failed to write cached thumbnail:', path, as_unicode(err))
                    return self._apply_size()
            else:
                self.log('Failed to write cached thumbnail:', path, as_unicode(err))
                return self._apply_size()
        self.items[key] = Entry(path, len(data), timestamp, self.thumbnail_size)
        self.total_size += len(data)
        self._apply_size()
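# The pop-and-reinsert idiom used by __getitem__() and insert() above is how an
# OrderedDict doubles as an LRU structure: popping a key removes it from its
# current position, and assigning it again appends it at the most-recently-used
# end. A minimal self-contained sketch of just that idiom (TinyLRU and its
# names are invented for illustration, not part of the cache above):

from collections import OrderedDict


class TinyLRU(object):

    def __init__(self, max_items=3):
        self.items = OrderedDict()
        self.max_items = max_items

    def get(self, key):
        val = self.items.pop(key, None)  # detach from current position
        if val is not None:
            self.items[key] = val  # re-append at the most-recently-used end
        return val

    def put(self, key, val):
        self.items.pop(key, None)
        self.items[key] = val
        while len(self.items) > self.max_items:
            self.items.popitem(last=False)  # evict the least-recently-used key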
def run(self):
    if self.tdir is not None:
        try:
            self.extract()
        except Exception as err:
            import traceback
            traceback.print_exc()
            msg = as_unicode(err)
            self.found.emit(msg)
            return
        self.path = self.tdir
    root = os.path.abspath(self.path)
    try:
        self.walk(root)
    except:
        try:
            if isinstance(root, unicode):
                root = root.encode(filesystem_encoding)
            self.walk(root)
        except Exception as err:
            import traceback
            traceback.print_exc()
            msg = as_unicode(err)
            self.found.emit(msg)
            return
    self.books = [formats for formats in self.books if formats]
    if not self.canceled:
        self.found.emit(self.books)
def main(report_error=prints, report_action=prints):
    try:
        if time.time() - cache.mtime() < UPDATE_INTERVAL:
            report_action('Metadata sources cache was recently updated, not updating again')
            return
        try:
            report_action('Fetching metadata source hashes...')
            needed = update_needed()
        except Exception as e:
            report_error(
                'Failed to get metadata sources hashes with error: {}'.format(as_unicode(e)))
            return
        if not needed:
            cache.touch()
            return
        updated = {}
        for name, expected_hash in needed.iteritems():
            report_action('Updating metadata source {}...'.format(name))
            try:
                update_plugin(name, updated, expected_hash)
            except Exception as e:
                report_error('Failed to get plugin {} with error: {}'.format(
                    name, as_unicode(e)))
                break
        else:
            hashes = cache.get('hashes', {})
            for name in updated:
                hashes[name] = updated[name][1]
            with cache:
                cache['hashes'] = hashes
                for name in updated:
                    cache[name] = updated[name][0]
    finally:
        update_sources.worker = None
def daemonize():  # {{{
    try:
        pid = os.fork()
        if pid > 0:
            # exit first parent
            sys.exit(0)
    except OSError as e:
        raise SystemExit('fork #1 failed: %s' % as_unicode(e))

    # decouple from parent environment
    os.chdir("/")
    os.setsid()
    os.umask(0)

    # do second fork
    try:
        pid = os.fork()
        if pid > 0:
            # exit from second parent
            sys.exit(0)
    except OSError as e:
        raise SystemExit('fork #2 failed: %s' % as_unicode(e))

    # Redirect standard file descriptors.
    plugins['speedup'][0].detach(os.devnull)
def monitor_scan(self):
    self.scan_thread.join(0.05)
    if self.scan_thread.is_alive():
        self.do_one_signal.emit()
        return
    if self.scan_error is not None:
        error_dialog(self.pd, _('Cannot add books'), _(
            'Failed to add any books, click "Show details" for more information.'),
            det_msg=self.scan_error, show=True)
        self.break_cycles()
        return
    if not self.file_groups:
        error_dialog(self.pd, _('Could not add'), _(
            'No ebook files were found in %s') % self.source, show=True)
        self.break_cycles()
        return
    self.pd.max = len(self.file_groups)
    self.pd.title = _('Reading metadata and adding to library (%d books)...') % self.pd.max
    self.pd.msg = ''
    self.pd.value = 0
    self.pool = Pool(name='AddBooks') if self.pool is None else self.pool
    if self.db is not None:
        if self.add_formats_to_existing:
            self.find_identical_books_data = self.db.data_for_find_identical_books()
        else:
            try:
                self.pool.set_common_data(self.db.data_for_has_book())
            except Failure as err:
                error_dialog(self.pd, _('Cannot add books'), _(
                    'Failed to add any books, click "Show details" for more information.'),
                    det_msg=as_unicode(err.failure_message) + '\n' + as_unicode(err.details), show=True)
                self.pd.canceled = True
    self.groups_to_add = iter(self.file_groups)
    self.do_one = self.do_one_group
    self.do_one_signal.emit()
def more_books(ctx, rd): """ Get more results from the specified search-query, which must be specified as JSON in the request body. Optional: ?num=50&library_id=<default library> """ db, library_id = get_library_data(ctx, rd.query)[:2] try: num = int(rd.query.get("num", DEFAULT_NUMBER_OF_BOOKS)) except Exception: raise HTTPNotFound("Invalid number of books: %r" % rd.query.get("num")) try: search_query = load_json_file(rd.request_body_file) query, offset, sorts, orders = ( search_query["query"], search_query["offset"], search_query["sort"], search_query["sort_order"], ) except KeyError as err: raise HTTPBadRequest("Search query missing key: %s" % as_unicode(err)) except Exception as err: raise HTTPBadRequest("Invalid query: %s" % as_unicode(err)) ans = {} with db.safe_read_lock: ans["search_result"] = search_result(ctx, rd, db, query, num, offset, sorts, orders) mdata = ans["metadata"] = {} for book_id in ans["search_result"]["book_ids"]: data = book_as_json(db, book_id) if data is not None: mdata[book_id] = data return ans
def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=60):  # {{{
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode

    if not self.is_configured():
        return
    query = self.create_query(log, title=title, authors=authors,
                              identifiers=identifiers)
    if not query:
        err = u'Insufficient metadata to construct query'
        log.error(err)
        return err
    try:
        raw = self.browser.open_novisit(query).read()
    except Exception as e:
        log.exception(u'Failed to make identify query: %r' % query)
        return as_unicode(e)
    try:
        parser = etree.XMLParser(recover=True, no_network=True)
        feed = etree.fromstring(xml_to_unicode(raw,
            strip_encoding_pats=True, assume_utf8=True)[0], parser=parser)
        entries = feed.xpath('//*[local-name()="SearchItems" or local-name()="ItemDetail"]')
        if entries:
            metadata = self.get_metadata(log, entries, title, authors, identifiers)
            self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
    except Exception as e:
        log.exception('Failed to parse identify results')
        return as_unicode(e)
def daemonize():  # {{{
    try:
        pid = os.fork()
        if pid > 0:
            # exit first parent
            sys.exit(0)
    except OSError as e:
        raise SystemExit("fork #1 failed: %s" % as_unicode(e))

    # decouple from parent environment
    os.chdir("/")
    os.setsid()
    os.umask(0)

    # do second fork
    try:
        pid = os.fork()
        if pid > 0:
            # exit from second parent
            sys.exit(0)
    except OSError as e:
        raise SystemExit("fork #2 failed: %s" % as_unicode(e))

    # Redirect standard file descriptors.
    try:
        plugins["speedup"][0].detach(os.devnull)
    except AttributeError:  # people running from source without updated binaries
        si = os.open(os.devnull, os.O_RDONLY)
        so = os.open(os.devnull, os.O_WRONLY)
        se = os.open(os.devnull, os.O_WRONLY)
        os.dup2(si, sys.stdin.fileno())
        os.dup2(so, sys.stdout.fileno())
        os.dup2(se, sys.stderr.fileno())
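# Both daemonize() variants above use the classic Unix double-fork: the first
# fork lets the original parent exit so the child is re-parented to init,
# os.setsid() then makes that child a session leader detached from any
# controlling terminal, and the second fork guarantees the surviving process
# is not a session leader and so can never re-acquire a terminal. The final
# step points fds 0-2 at os.devnull, either via the compiled speedup helper
# or, in the second variant, via the pure-Python os.dup2() fallback.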
def parse(self, xml_detail, xml_more_info):
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_more_info)
    publisher = self.parse_publisher(xml_detail)
    tags = self.parse_tags(xml_detail, xml_more_info)
    serie, serie_index = self.parse_serie(xml_detail)
    pub_year = self.parse_pub_year(xml_detail, xml_more_info)
    cover = self.parse_cover(xml_detail)

    if title is not None and authors is not None:
        mi = Metadata(as_unicode(title), authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(comments)
        mi.identifiers = {self.plugin.name: self.ident}
        mi.rating = rating
        mi.tags = tags
        mi.publisher = publisher
        mi.pubdate = pub_year
        mi.isbn = isbn
        mi.series = serie
        mi.series_index = serie_index
        mi.cover_url = cover

        if cover:
            self.plugin.cache_identifier_to_cover_url(self.ident, cover)
        return mi
    else:
        self.log('Result skipped because title or authors not found')
        return None
def download_parse(self, query, timeout):
    # self.downloads_count += 1
    # number = self.downloads_count
    br = self.browser
    try:
        self.log('download page search %s' % query)
        data = urllib.urlencode(query[1])
        raw = br.open(query[0], data, timeout=timeout).read().strip()
    except Exception as e:
        self.log.exception('Failed to make identify query: %r' % query)
        return as_unicode(e)

    try:
        parser = etree.HTMLParser(recover=True)
        clean = clean_ascii_chars(raw)
        # self.log.filelog(clean, "\\tmp\\test.html")
        feed = fromstring(clean, parser=parser)
        # if len(parser.error_log) > 0:
        #     self.log('some errors occurred while parsing the page:')
        #     self.log(parser.error_log)
        return feed
    except Exception as e:
        self.log.exception('Failed to parse identify results')
        return as_unicode(e)
def _doit(self, newdb):
    for i, x in enumerate(self.ids):
        try:
            self.do_one(i, x, newdb)
        except Exception as err:
            import traceback
            err = as_unicode(err)
            self.failed_books[x] = (err, as_unicode(traceback.format_exc()))
def _doit(self, newdb):
    for i, x in enumerate(self.ids):
        if self.was_canceled:
            self.left_after_cancel = len(self.ids) - i
            break
        try:
            self.do_one(i, x, newdb)
        except Exception as err:
            import traceback
            err = as_unicode(err)
            self.failed_books[x] = (err, as_unicode(traceback.format_exc()))
def fset(self, val):
    if self.type in ('checkbox', 'radio'):
        if val:
            self.qwe.setAttribute('checked', 'checked')
        else:
            self.qwe.removeAttribute('checked')
    elif self.type in ('text', 'password', 'hidden', 'email', 'search'):
        self.qwe.setAttribute('value', as_unicode(val))
    elif self.type in ('number', 'range'):
        self.qwe.setAttribute('value', '%d' % int(val))
    else:  # Unknown type, treat as text
        self.qwe.setAttribute('value', as_unicode(val))
def get_image_urls(self, title, author, log, abort, timeout):
    from calibre.utils.ipc.simple_worker import fork_job, WorkerError
    try:
        return fork_job('calibre.ebooks.metadata.sources.google_images',
                'search', args=(title, author, self.prefs['size'], timeout),
                no_output=True, abort=abort, timeout=timeout)['result']
    except WorkerError as e:
        if e.orig_tb:
            log.error(e.orig_tb)
        log.exception('Searching google failed:' + as_unicode(e))
    except Exception as e:
        log.exception('Searching google failed:' + as_unicode(e))
    return []
def do_scan(self):
    self.reload_cache()

    if isworker:
        # Dont scan font files in worker processes, use whatever is
        # cached. Font files typically dont change frequently enough to
        # justify a rescan in a worker process.
        self.build_families()
        return

    cached_fonts = self.cached_fonts.copy()
    self.cached_fonts.clear()
    for folder in self.folders:
        if not os.path.isdir(folder):
            continue
        try:
            files = tuple(walk(folder))
        except EnvironmentError as e:
            if DEBUG:
                prints('Failed to walk font folder:', folder, as_unicode(e))
            continue
        for candidate in files:
            if (candidate.rpartition('.')[-1].lower() not in self.allowed_extensions or
                    not os.path.isfile(candidate)):
                continue
            candidate = os.path.normcase(os.path.abspath(candidate))
            try:
                s = os.stat(candidate)
            except EnvironmentError:
                continue
            fileid = '{0}||{1}:{2}'.format(candidate, s.st_size, s.st_mtime)
            if fileid in cached_fonts:
                # Use previously cached metadata, since the file size and
                # last modified timestamp have not changed.
                self.cached_fonts[fileid] = cached_fonts[fileid]
                continue
            try:
                self.read_font_metadata(candidate, fileid)
            except Exception as e:
                if DEBUG:
                    prints('Failed to read metadata from font file:', candidate, as_unicode(e))
                continue

    if frozenset(cached_fonts) != frozenset(self.cached_fonts):
        # Write out the cache only if some font files have changed
        self.write_cache()

    self.build_families()
def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=90):  # {{{
    from calibre.ebooks.chardet import xml_to_unicode
    from HTMLParser import HTMLParser
    from lxml import etree, html

    if not self.is_configured():
        return
    query = self.create_query(log, title=title, authors=authors,
                              identifiers=identifiers)
    if not query:
        err = u'Insufficient metadata to construct query'
        log.error(err)
        return err
    try:
        raw = self.browser.open_novisit(query).read()
    except Exception as e:
        log.exception(u'Failed to make identify query: %r' % query)
        return as_unicode(e)
    try:
        doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
        entries_block = doc.xpath(u'//div[@class="bSearchResult"]')
        if entries_block:
            entries = doc.xpath(u'//div[contains(@itemprop, "itemListElement")]')
            # for entry in entries:
            #     log.debug('entries %s' % etree.tostring(entry))
            metadata = self.get_metadata(log, entries, title, authors, identifiers)
            self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
        else:
            # Redirect page: trying to extract ozon_id from javascript data
            h = HTMLParser()
            entry_string = (h.unescape(unicode(etree.tostring(doc, pretty_print=True))))
            id_title_pat = re.compile(u'products":\[{"id":(\d{7}),"name":"([а-яА-Я :\-0-9]+)')
            # result containing ozon_id and entry_title
            entry_info = re.search(id_title_pat, entry_string)
            ozon_id = entry_info.group(1) if entry_info else None
            entry_title = entry_info.group(2) if entry_info else None
            if ozon_id:
                metadata = self.to_metadata_for_single_entry(log, ozon_id, entry_title, authors)
                identifiers['ozon'] = ozon_id
                self.get_all_details(log, [metadata], abort, result_queue,
                                     identifiers, timeout, cachedPagesDict={})
            else:
                log.error('No SearchResults in Ozon.ru response found')
    except Exception as e:
        log.exception('Failed to parse identify results')
        return as_unicode(e)
def cdb_run(ctx, rd, which, version):
    try:
        m = module_for_cmd(which)
    except ImportError:
        raise HTTPNotFound('No module named: {}'.format(which))
    if not getattr(m, 'readonly', False):
        ctx.check_for_write_access(rd)
    if getattr(m, 'version', 0) != int(version):
        raise HTTPNotFound(('The module {} is not available in version: {}.'
                            ' Make sure the version of calibre used for the'
                            ' server and calibredb match').format(which, version))
    db = get_library_data(ctx, rd, strict_library_id=True)[0]
    if ctx.restriction_for(rd, db):
        raise HTTPForbidden('Cannot use the command-line db interface with a user who has per library restrictions')
    raw = rd.read()
    ct = rd.inheaders.get('Content-Type', all=True)
    try:
        if MSGPACK_MIME in ct:
            args = msgpack_loads(raw)
        elif 'application/json' in ct:
            args = json_loads(raw)
        else:
            raise HTTPBadRequest('Only JSON or msgpack requests are supported')
    except Exception:
        raise HTTPBadRequest('args are not valid encoded data')
    if getattr(m, 'needs_srv_ctx', False):
        args = [ctx] + list(args)
    try:
        result = m.implementation(db, partial(ctx.notify_changes, db.backend.library_path), *args)
    except Exception as err:
        import traceback
        return {'err': as_unicode(err), 'tb': traceback.format_exc()}
    return {'result': result}
def parse_uri(uri, parse_query=True):
    scheme, authority, path = parse_request_uri(uri)
    if b'#' in path:
        raise HTTPSimpleResponse(httplib.BAD_REQUEST, "Illegal #fragment in Request-URI.")

    if scheme:
        try:
            scheme = scheme.decode('ascii')
        except ValueError:
            raise HTTPSimpleResponse(httplib.BAD_REQUEST, 'Un-decodeable scheme')

    path, qs = path.partition(b'?')[::2]
    if parse_query:
        try:
            query = MultiDict.create_from_query_string(qs)
        except Exception:
            raise HTTPSimpleResponse(httplib.BAD_REQUEST, 'Unparseable query string')
    else:
        query = None

    try:
        path = '%2F'.join(unquote(x).decode('utf-8') for x in quoted_slash.split(path))
    except ValueError as e:
        raise HTTPSimpleResponse(httplib.BAD_REQUEST, as_unicode(e))
    path = tuple(filter(None, (x.replace('%2F', '/') for x in path.split('/'))))
    return scheme, path, query
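# A minimal sketch (invented names, not server code) of the %2F round-trip that
# parse_uri() performs above: a percent-encoded slash inside a path component
# must survive unquoting without becoming a component separator. The path is
# first split on quoted slashes, each piece is unquoted, the pieces are
# rejoined with the literal marker '%2F', and only after splitting on real '/'
# is the marker turned back into a '/' character.

import re
from urllib import unquote  # Python 2, as in the code above

quoted_slash = re.compile(br'%2[fF]')


def split_path(path):
    decoded = '%2F'.join(unquote(x).decode('utf-8') for x in quoted_slash.split(path))
    return tuple(filter(None, (x.replace('%2F', '/') for x in decoded.split('/'))))

# split_path(b'/a%2Fb/c') -> (u'a/b', u'c'): the encoded slash stays inside the
# first component instead of producing a third one.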
def do_bind(self):
    # Get the correct address family for our host (allows IPv6 addresses)
    host, port = self.bind_address
    try:
        info = socket.getaddrinfo(host, port, socket.AF_UNSPEC,
                                  socket.SOCK_STREAM, 0, socket.AI_PASSIVE)
    except socket.gaierror:
        if ":" in host:
            info = [(socket.AF_INET6, socket.SOCK_STREAM, 0, "", self.bind_address + (0, 0))]
        else:
            info = [(socket.AF_INET, socket.SOCK_STREAM, 0, "", self.bind_address)]

    self.socket = None
    msg = "No socket could be created"
    for res in info:
        af, socktype, proto, canonname, sa = res
        try:
            self.bind(af, socktype, proto)
        except socket.error as serr:
            msg = "%s -- (%s: %s)" % (msg, sa, as_unicode(serr))
            if self.socket:
                self.socket.close()
            self.socket = None
            continue
        break
    if not self.socket:
        raise socket.error(msg)
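# Note on do_bind() above: passing socket.AF_UNSPEC to getaddrinfo() yields one
# candidate tuple per address family the host resolves to (IPv4 and/or IPv6),
# and bind() is attempted on each in resolver order until one succeeds. The
# gaierror fallback only guesses the family syntactically: a ':' in the host
# implies an IPv6 literal, whose sockaddr tuple needs the extra
# (flowinfo, scope_id) fields appended.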
def download_cover(self, log, result_queue, abort, title=None, authors=None,
        identifiers={}, timeout=30, get_best_cover=False):  # {{{
    cached_url = self.get_cached_cover_url(identifiers)
    if cached_url is None:
        log.debug('No cached cover found, running identify')
        rq = Queue()
        self.identify(log, rq, abort, title=title, authors=authors,
                      identifiers=identifiers)
        if abort.is_set():
            return
        results = []
        while True:
            try:
                results.append(rq.get_nowait())
            except Empty:
                break
        results.sort(key=self.identify_results_keygen(
            title=title, authors=authors, identifiers=identifiers))
        for mi in results:
            cached_url = self.get_cached_cover_url(mi.identifiers)
            if cached_url is not None:
                break
    if cached_url is None:
        log.info('No cover found')
        return

    if abort.is_set():
        return
    log.debug('Downloading cover from:', cached_url)
    try:
        cdata = self.browser.open_novisit(cached_url, timeout=timeout).read()
        if cdata:
            result_queue.put((self, cdata))
    except Exception as e:
        log.exception(u'Failed to download cover from: %s' % cached_url)
        return as_unicode(e)
def books_prepared(self, view, job):
    self.bpd.hide()
    self.bpd = None
    if job.exception is not None:
        self.gui.device_job_exception(job)
        return
    paths = job.result
    ok_paths = [x for x in paths if isinstance(x, basestring)]
    failed_paths = [x for x in paths if isinstance(x, tuple)]
    if failed_paths:
        if not ok_paths:
            msg = _('Could not download files from the device')
            typ = error_dialog
        else:
            msg = _('Could not download some files from the device')
            typ = warning_dialog
        det_msg = [x[0] + '\n ' + as_unicode(x[1]) for x in failed_paths]
        det_msg = '\n\n'.join(det_msg)
        typ(self.gui, _('Could not download files'), msg, det_msg=det_msg, show=True)
    if ok_paths:
        from calibre.gui2.add import Adder
        callback = partial(self._add_from_device_adder, on_card=None,
                           model=view.model())
        Adder(ok_paths, db=self.gui.current_db, parent=self.gui,
              callback=callback, pool=self.gui.spare_pool())
def _write_order(self):
    if hasattr(self, 'items'):
        try:
            with open(os.path.join(self.location, 'order'), 'wb') as f:
                f.write(cPickle.dumps(tuple(map(hash, self.items)), -1))
        except EnvironmentError as err:
            self.log('Failed to save thumbnail cache order:', as_unicode(err))
def get_books(ctx, rd):
    '''
    Get books for the specified query

    Optional: ?library_id=<default library>&num=50&sort=timestamp.desc&search=''
    '''
    library_id, db, sorts, orders = get_basic_query_data(ctx, rd.query)
    try:
        num = int(rd.query.get('num', DEFAULT_NUMBER_OF_BOOKS))
    except Exception:
        raise HTTPNotFound('Invalid number of books: %r' % rd.query.get('num'))
    searchq = rd.query.get('search', '')
    db = get_library_data(ctx, rd.query)[0]
    ans = {}
    mdata = ans['metadata'] = {}
    with db.safe_read_lock:
        try:
            ans['search_result'] = search_result(ctx, rd, db, searchq, num, 0,
                                                 ','.join(sorts), ','.join(orders))
        except ParseException as err:
            # This must not be translated as it is used by the front end to
            # detect invalid search expressions
            raise HTTPBadRequest('Invalid search expression: %s' % as_unicode(err))
        for book_id in ans['search_result']['book_ids']:
            data = book_as_json(db, book_id)
            if data is not None:
                mdata[book_id] = data
    return ans
def open(self, connected_device, library_uuid):
    self.dev = self._filesystem_cache = None
    try:
        self.dev = self.create_device(connected_device)
    except Exception as e:
        self.blacklisted_devices.add(connected_device)
        raise OpenFailed('Failed to open %s: Error: %s' % (
            connected_device, as_unicode(e)))

    storage = sorted(self.dev.storage_info, key=operator.itemgetter('id'))
    storage = [x for x in storage if x.get('rw', False)]
    if not storage:
        self.blacklisted_devices.add(connected_device)
        raise OpenFailed('No storage found for device %s' % (connected_device,))
    snum = self.dev.serial_number
    if snum in self.prefs.get('blacklist', []):
        self.blacklisted_devices.add(connected_device)
        self.dev = None
        raise BlacklistedDevice(
            'The %s device has been blacklisted by the user' % (connected_device,))
    self._main_id = storage[0]['id']
    self._carda_id = self._cardb_id = None
    if len(storage) > 1:
        self._carda_id = storage[1]['id']
    if len(storage) > 2:
        self._cardb_id = storage[2]['id']
    self.current_friendly_name = self.dev.friendly_name
    if not self.current_friendly_name:
        self.current_friendly_name = self.dev.model_name or _('Unknown MTP device')
    self.current_serial_num = snum
    self.currently_connected_dev = connected_device
def start(self):
    self.is_running = False
    self.exception = None
    cherrypy.tree.mount(root=None, config=self.config)
    try:
        self.start_cherrypy()
    except Exception as e:
        self.exception = e
        import traceback
        traceback.print_exc()
        if callable(self.start_failure_callback):
            try:
                self.start_failure_callback(as_unicode(e))
            except:
                pass
        return
    try:
        self.is_running = True
        self.notify_listener()
        cherrypy.engine.block()
    except Exception as e:
        import traceback
        traceback.print_exc()
        self.exception = e
    finally:
        self.is_running = False
        self.notify_listener()
def parse(self, xml_detail):
    title = self.parse_title(xml_detail)
    authors = self.parse_authors(xml_detail)
    comments = self.parse_comments(xml_detail)
    rating = self.parse_rating(xml_detail)
    isbn = self.parse_isbn(xml_detail)
    publisher = self.parse_publisher(xml_detail)
    pub_year = self.parse_pubdate(xml_detail)
    tags = self.parse_tags(xml_detail)
    serie, serie_index = self.parse_serie(xml_detail)
    cover = self.parse_cover(xml_detail)

    if title is not None and authors is not None:
        mi = Metadata(title, authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(comments)
        mi.identifiers = {self.plugin.name: str(self.number)}
        mi.rating = rating
        mi.tags = tags
        mi.publisher = publisher
        mi.pubdate = pub_year
        mi.isbn = isbn
        mi.series = serie
        mi.series_index = serie_index
        mi.cover_url = cover

        if cover:
            self.plugin.cache_identifier_to_cover_url(str(self.number), cover)
        return mi
    else:
        return None
def run(self):
    if self.xml is None:
        raw = None
        url = None
        try:
            url = self.plugin.create_query(self.title, self.authors, self.number)
            self.log('download page search %s' % url)
            raw = self.plugin.browser.open(url, timeout=self.timeout).read().strip()
        except Exception as e:
            self.log.exception('Failed to make identify query: %r' % url)
            return as_unicode(e)

        if raw is not None:
            try:
                parser = etree.HTMLParser()
                clean = clean_ascii_chars(raw)
                self.xml = fromstring(clean, parser=parser)
                # if len(parser.error_log) > 0:
                #     self.log('some errors occurred while parsing the page:')
                #     self.log(parser.error_log)
            except Exception as e:
                self.log.exception('Failed to parse xml for url: %s' % url)
    self.parse()
def books_prepared(self, view, job):
    self.bpd.hide()
    self.bpd = None
    if job.exception is not None:
        self.gui.device_job_exception(job)
        return
    paths = job.result
    ok_paths = [x for x in paths if isinstance(x, basestring)]
    failed_paths = [x for x in paths if isinstance(x, tuple)]
    if failed_paths:
        if not ok_paths:
            msg = _("Could not download files from the device")
            typ = error_dialog
        else:
            msg = _("Could not download some files from the device")
            typ = warning_dialog
        det_msg = [x[0] + "\n " + as_unicode(x[1]) for x in failed_paths]
        det_msg = "\n\n".join(det_msg)
        typ(self.gui, _("Could not download files"), msg, det_msg=det_msg, show=True)
    if ok_paths:
        from calibre.gui2.add import Adder
        self.__adder_func = partial(self._add_from_device_adder, on_card=None,
                                    model=view.model())
        self._adder = Adder(
            self.gui,
            self.gui.library_view.model().db,
            self.Dispatcher(self.__adder_func),
            spare_server=self.gui.spare_server,
        )
        self._adder.add(ok_paths)
def identify(self, log, result_queue, abort, title, authors,
             identifiers={}, timeout=30):
    '''
    Note this method will retry without identifiers automatically if no
    match is found with identifiers.
    '''
    matches = []
    databazeknih_id = identifiers.get('databazeknih', None)
    log.info(u'\nTitle:%s\nAuthors:%s\n' % (title, authors))
    br = browser()
    if databazeknih_id:
        matches.append(databazeknih.BASE_URL + 'knihy/' + databazeknih_id)
    else:
        query = self.create_query(log, title=title, authors=authors)
        if query is None:
            log.error('Insufficient metadata to construct query')
            return
        try:
            log.info(u'Querying: %s' % query)
            response = br.open(query)
        except Exception as e:
            isbn = check_isbn(identifiers.get('isbn', None))
            if isbn and callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                log.info('Failed to find match for ISBN: %s' % isbn)
            else:
                err = 'Failed to make identify query: %r' % query
                log.info(err)
                return as_unicode(e)
                # return e
        try:
            raw = response.read().strip()
            raw = raw.decode('utf-8', errors='replace')
            if not raw:
                log.error('Failed to get raw result for query: %r' % query)
                return
            root = fromstring(clean_ascii_chars(raw))
        except:
            msg = 'Failed to parse databazeknih page for query: %r' % query
            log.exception(msg)
            return msg
        self._parse_search_results(log, title, authors, root, matches, timeout)

    if abort.is_set():
        return
    if not matches:
        if identifiers and title and authors:
            log.info('No matches found with identifiers, retrying using only'
                     ' title and authors')
            return self.identify(log, result_queue, abort, title=title,
                                 authors=authors, timeout=timeout)
        log.error('No matches found with query: %r' % query)
        return

    log.debug('Starting workers for: %s' % (matches,))
    from calibre_plugins.databazeknih.worker import Worker
    workers = [Worker(url, result_queue, br, log, i, self) for i, url in
               enumerate(matches)]

    for w in workers:
        w.start()
        time.sleep(0.1)

    while not abort.is_set():
        a_worker_is_alive = False
        for w in workers:
            w.join(0.2)
            if abort.is_set():
                break
            if w.is_alive():
                a_worker_is_alive = True
        if not a_worker_is_alive:
            break
    return None
def dump_and_restore(self):
    try:
        self.db.dump_and_restore(self.update_msg.emit)
    except Exception as e:
        import traceback
        self.error = (as_unicode(e), traceback.format_exc())
def subset_all_fonts(container, font_stats, report):
    remove = set()
    total_old = total_new = 0
    changed = False
    for name, mt in container.mime_map.iteritems():
        if (mt in OEB_FONTS or name.rpartition('.')[-1].lower() in {'otf', 'ttf'}) and mt != guess_type('a.woff'):
            chars = font_stats.get(name, set())
            with container.open(name, 'rb') as f:
                f.seek(0, os.SEEK_END)
                total_old += f.tell()
            if not chars:
                remove.add(name)
                report('Removed unused font: %s' % name)
                continue
            with container.open(name, 'r+b') as f:
                raw = f.read()
                font_name = get_font_names(raw)[-1]
                warnings = []
                container.log('Subsetting font: %s' % (font_name or name))
                try:
                    nraw, old_sizes, new_sizes = subset(raw, chars, warnings=warnings)
                except UnsupportedFont as e:
                    container.log.warning(
                        'Unsupported font: %s, ignoring. Error: %s' % (name, as_unicode(e)))
                    continue
                for w in warnings:
                    container.log.warn(w)
                olen = sum(old_sizes.itervalues())
                nlen = sum(new_sizes.itervalues())
                total_new += len(nraw)
                if nlen == olen:
                    report('The font %s was already subset' % font_name)
                else:
                    report('Decreased the font %s to %.1f%% of its original size' % (
                        font_name, nlen / olen * 100))
                    changed = True
                f.seek(0), f.truncate(), f.write(nraw)

    for name in remove:
        container.remove_item(name)
        changed = True

    if remove:
        for name, mt in container.mime_map.iteritems():
            if mt in OEB_STYLES:
                sheet = container.parsed(name)
                if remove_font_face_rules(container, sheet, remove, name):
                    container.dirty(name)
            elif mt in OEB_DOCS:
                for style in XPath('//h:style')(container.parsed(name)):
                    if style.get('type', 'text/css') == 'text/css' and style.text:
                        sheet = container.parse_css(style.text, name)
                        if remove_font_face_rules(container, sheet, remove, name):
                            style.text = sheet.cssText
                            container.dirty(name)
    if total_old > 0:
        report('Reduced total font size to %.1f%% of original' % (total_new / total_old * 100))
    else:
        report('No embedded fonts found')
    return changed
def _do_delete(self, path):
    try:
        os.remove(path)
    except EnvironmentError as err:
        self.log('Failed to delete cached thumbnail file:', as_unicode(err))
def tweak(ebook_file):
    ''' Command line interface to the Tweak Book tool '''
    fmt = ebook_file.rpartition('.')[-1].lower()
    exploder, rebuilder = get_tools(fmt)
    if exploder is None:
        prints('Cannot tweak %s files. Supported formats are: EPUB, HTMLZ, AZW3, MOBI'
               % fmt.upper(), file=sys.stderr)
        raise SystemExit(1)

    with TemporaryDirectory(
            '_tweak_' + os.path.basename(ebook_file).rpartition('.')[0]) as tdir:
        try:
            opf = exploder(ebook_file, tdir, question=ask_cli_question)
        except WorkerError as e:
            prints('Failed to unpack', ebook_file)
            prints(e.orig_tb)
            raise SystemExit(1)
        except Error as e:
            prints(as_unicode(e), file=sys.stderr)
            raise SystemExit(1)

        if opf is None:
            # The question was answered with No
            return

        ed = os.environ.get('EDITOR', 'dummy')
        cmd = shlex.split(ed)
        isvim = bool([x for x in cmd[0].split('/') if x.endswith('vim')])

        proceed = False
        prints('Book extracted to', tdir)

        if not isvim:
            prints('Make your tweaks and once you are done,', __appname__,
                   'will rebuild', ebook_file, 'from', tdir)
            print()
            proceed = ask_cli_question('Rebuild ' + ebook_file + '?')
        else:
            base = os.path.basename(ebook_file)
            with TemporaryFile(base + '.zip') as zipf:
                with ZipFile(zipf, 'w') as zf:
                    zf.add_dir(tdir)
                try:
                    subprocess.check_call(cmd + [zipf])
                except:
                    prints(ed, 'failed, aborting...')
                    raise SystemExit(1)
                with ZipFile(zipf, 'r') as zf:
                    shutil.rmtree(tdir)
                    os.mkdir(tdir)
                    zf.extractall(path=tdir)
            proceed = True

        if proceed:
            prints('Rebuilding', ebook_file, 'please wait ...')
            try:
                rebuilder(tdir, ebook_file)
            except WorkerError as e:
                prints('Failed to rebuild', ebook_file)
                prints(e.orig_tb)
                raise SystemExit(1)
            prints(ebook_file, 'successfully tweaked')
def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=90):  # {{{
    from calibre.ebooks.chardet import xml_to_unicode
    from HTMLParser import HTMLParser
    from lxml import etree, html
    import json

    if not self.is_configured():
        return
    query = self.create_query(log, title=title, authors=authors,
                              identifiers=identifiers)
    if not query:
        err = u'Insufficient metadata to construct query'
        log.error(err)
        return err
    try:
        raw = self.browser.open_novisit(query).read()
    except Exception as e:
        log.exception(u'Failed to make identify query: %r' % query)
        return as_unicode(e)
    try:
        doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
        entries_block = doc.xpath(u'//div[@class="bSearchResult"]')
        # log.debug(u'HTML: %s' % xml_to_unicode(raw, verbose=True)[0])
        if entries_block:
            entries = doc.xpath(u'//div[contains(@itemprop, "itemListElement")]')
            # log.debug(u'entries_block')
            # for entry in entries:
            #     log.debug('entries %s' % etree.tostring(entry))
            metadata = self.get_metadata(log, entries, title, authors, identifiers)
            self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout)
        else:
            # Redirect page: trying to extract ozon_id from javascript data
            h = HTMLParser()
            entry_string = (h.unescape(
                etree.tostring(doc, pretty_print=True, encoding='unicode')))
            json_pat = re.compile(r'dataLayer\s*=\s*(.+)?;')
            json_info = re.search(json_pat, entry_string)
            jsondata = json_info.group(1) if json_info else None
            if jsondata:
                idx = jsondata.rfind('}]')
                if idx > 0:
                    jsondata = jsondata[:idx + 2]
            # log.debug(u'jsondata: %s' % jsondata)
            dataLayer = json.loads(jsondata) if jsondata else None
            ozon_id = None
            if dataLayer and dataLayer[0] and 'ecommerce' in dataLayer[0]:
                jsproduct = dataLayer[0]['ecommerce']['detail']['products'][0]
                ozon_id = as_unicode(jsproduct['id'])
                entry_title = as_unicode(jsproduct['name'])
                log.debug(u'ozon_id %s' % ozon_id)
                log.debug(u'entry_title %s' % entry_title)
                if ozon_id:
                    metadata = self.to_metadata_for_single_entry(
                        log, ozon_id, entry_title, authors)
                    identifiers['ozon'] = ozon_id
                    self.get_all_details(log, [metadata], abort, result_queue,
                                         identifiers, timeout, cachedPagesDict={})
            if not ozon_id:
                log.error('No SearchResults in Ozon.ru response found!')
    except Exception as e:
        log.exception('Failed to parse identify results')
        return as_unicode(e)
def __init__(self, tree, path, oeb, opts, profile=None,
             extra_css='', user_css='', base_css=''):
    self.oeb, self.opts = oeb, opts
    self.profile = profile
    if self.profile is None:
        # Use the default profile. This should really be using
        # opts.output_profile, but I don't want to risk changing it, as
        # doing so might well have hard to debug font size effects.
        from calibre.customize.ui import output_profiles
        for x in output_profiles():
            if x.short_name == 'default':
                self.profile = x
                break
    if self.profile is None:
        # Just in case the default profile is removed in the future :)
        self.profile = opts.output_profile
    self.body_font_size = self.profile.fbase
    self.logger = oeb.logger
    item = oeb.manifest.hrefs[path]
    basename = os.path.basename(path)
    cssname = os.path.splitext(basename)[0] + '.css'
    stylesheets = [html_css_stylesheet()]
    if base_css:
        stylesheets.append(parseString(base_css, validate=False))
    style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')

    # Add cssutils parsing profiles from output_profile
    for profile in self.opts.output_profile.extra_css_modules:
        cssprofiles.addProfile(profile['name'], profile['props'], profile['macros'])

    parser = CSSParser(fetcher=self._fetch_css_file,
                       log=logging.getLogger('calibre.css'))
    self.font_face_rules = []
    for elem in style_tags:
        if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES and
                media_ok(elem.get('media'))):
            text = elem.text if elem.text else u''
            for x in elem:
                t = getattr(x, 'text', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
                t = getattr(x, 'tail', None)
                if t:
                    text += u'\n\n' + force_unicode(t, u'utf-8')
            if text:
                text = oeb.css_preprocessor(text)
                # We handle @import rules separately
                parser.setFetcher(lambda x: ('utf-8', b''))
                stylesheet = parser.parseString(text, href=cssname, validate=False)
                parser.setFetcher(self._fetch_css_file)
                for rule in stylesheet.cssRules:
                    if rule.type == rule.IMPORT_RULE:
                        ihref = item.abshref(rule.href)
                        if not media_ok(rule.media.mediaText):
                            continue
                        hrefs = self.oeb.manifest.hrefs
                        if ihref not in hrefs:
                            self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
                            continue
                        sitem = hrefs[ihref]
                        if sitem.media_type not in OEB_STYLES:
                            self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
                            continue
                        stylesheets.append(sitem.data)
                for rule in tuple(stylesheet.cssRules.rulesOfType(CSSRule.PAGE_RULE)):
                    stylesheet.cssRules.remove(rule)
                # Make links to resources absolute, since these rules will
                # be folded into a stylesheet at the root
                replaceUrls(stylesheet, item.abshref, ignoreImportRules=True)
                stylesheets.append(stylesheet)
        elif (elem.tag == XHTML('link') and elem.get('href') and
                elem.get('rel', 'stylesheet').lower() == 'stylesheet' and
                elem.get('type', CSS_MIME).lower() in OEB_STYLES and
                media_ok(elem.get('media'))):
            href = urlnormalize(elem.attrib['href'])
            path = item.abshref(href)
            sitem = oeb.manifest.hrefs.get(path, None)
            if sitem is None:
                self.logger.warn(
                    'Stylesheet %r referenced by file %r not in manifest' % (path, item.href))
                continue
            if not hasattr(sitem.data, 'cssRules'):
                self.logger.warn(
                    'Stylesheet %r referenced by file %r is not CSS' % (path, item.href))
                continue
            stylesheets.append(sitem.data)
    csses = {'extra_css': extra_css, 'user_css': user_css}
    for w, x in csses.items():
        if x:
            try:
                text = x
                stylesheet = parser.parseString(text, href=cssname, validate=False)
                stylesheets.append(stylesheet)
            except:
                self.logger.exception('Failed to parse %s, ignoring.' % w)
                self.logger.debug('Bad css: ')
                self.logger.debug(x)

    rules = []
    index = 0
    self.stylesheets = set()
    self.page_rule = {}
    for sheet_index, stylesheet in enumerate(stylesheets):
        href = stylesheet.href
        self.stylesheets.add(href)
        for rule in stylesheet.cssRules:
            if rule.type == rule.MEDIA_RULE:
                if media_ok(rule.media.mediaText):
                    for subrule in rule.cssRules:
                        rules.extend(self.flatten_rule(
                            subrule, href, index, is_user_agent_sheet=sheet_index == 0))
                        index += 1
            else:
                rules.extend(self.flatten_rule(
                    rule, href, index, is_user_agent_sheet=sheet_index == 0))
                index = index + 1
    rules.sort()
    self.rules = rules
    self._styles = {}
    pseudo_pat = re.compile(ur':{1,2}(%s)' % ('|'.join(INAPPROPRIATE_PSEUDO_CLASSES)), re.I)
    select = Select(tree, ignore_inappropriate_pseudo_classes=True)

    for _, _, cssdict, text, _ in rules:
        fl = pseudo_pat.search(text)
        try:
            matches = tuple(select(text))
        except SelectorError as err:
            self.logger.error(
                'Ignoring CSS rule with invalid selector: %r (%s)' % (text, as_unicode(err)))
            continue

        if fl is not None:
            fl = fl.group(1)
            if fl == 'first-letter' and getattr(self.oeb, 'plumber_output_format', '').lower() in {u'mobi', u'docx'}:
                # Fake first-letter
                from lxml.builder import ElementMaker
                E = ElementMaker(namespace=XHTML_NS)
                for elem in matches:
                    for x in elem.iter('*'):
                        if x.text:
                            punctuation_chars = []
                            text = unicode(x.text)
                            while text:
                                category = unicodedata.category(text[0])
                                if category[0] not in {'P', 'Z'}:
                                    break
                                punctuation_chars.append(text[0])
                                text = text[1:]
                            special_text = u''.join(punctuation_chars) + \
                                (text[0] if text else u'')
                            span = E.span(special_text)
                            span.set('data-fake-first-letter', '1')
                            span.tail = text[1:]
                            x.text = None
                            x.insert(0, span)
                            self.style(span)._update_cssdict(cssdict)
                            break
            else:  # Element pseudo-class
                for elem in matches:
                    self.style(elem)._update_pseudo_class(fl, cssdict)
        else:
            for elem in matches:
                self.style(elem)._update_cssdict(cssdict)

    for elem in xpath(tree, '//h:*[@style]'):
        self.style(elem)._apply_style_attr(url_replacer=item.abshref)

    num_pat = re.compile(r'[0-9.]+$')
    for elem in xpath(tree, '//h:img[@width or @height]'):
        style = self.style(elem)
        # Check if either height or width is not default
        is_styled = style._style.get('width', 'auto') != 'auto' or \
            style._style.get('height', 'auto') != 'auto'
        if not is_styled:
            # Update img style dimension using width and height
            upd = {}
            for prop in ('width', 'height'):
                val = elem.get(prop, '').strip()
                try:
                    del elem.attrib[prop]
                except:
                    pass
                if val:
                    if num_pat.match(val) is not None:
                        val += 'px'
                    upd[prop] = val
            if upd:
                style._update_cssdict(upd)
def load_ebook(self, pathtoebook, open_at=None, reopen_at=None):
    if self.iterator is not None:
        self.save_current_position()
        self.iterator.__exit__()
    self.iterator = EbookIterator(pathtoebook)
    self.history.clear()
    self.open_progress_indicator(_('Loading ebook...'))
    worker = Worker(target=partial(self.iterator.__enter__, view_kepub=True))
    worker.start()
    while worker.isAlive():
        worker.join(0.1)
        QApplication.processEvents()
    if worker.exception is not None:
        if isinstance(worker.exception, DRMError):
            from calibre.gui2.dialogs.drm_error import DRMErrorMessage
            DRMErrorMessage(self).exec_()
        else:
            r = getattr(worker.exception, 'reason', worker.exception)
            error_dialog(self, _('Could not open ebook'),
                         as_unicode(r) or _('Unknown error'),
                         det_msg=worker.traceback, show=True)
        self.close_progress_indicator()
    else:
        self.metadata.show_opf(self.iterator.opf, self.iterator.book_format)
        self.view.current_language = self.iterator.language
        title = self.iterator.opf.title
        if not title:
            title = os.path.splitext(os.path.basename(pathtoebook))[0]
        if self.iterator.toc:
            self.toc_model = TOC(self.iterator.spine, self.iterator.toc)
            self.toc.setModel(self.toc_model)
            if self.show_toc_on_open:
                self.action_table_of_contents.setChecked(True)
        else:
            self.toc_model = TOC(self.iterator.spine)
            self.toc.setModel(self.toc_model)
            self.action_table_of_contents.setChecked(False)
        if isbytestring(pathtoebook):
            pathtoebook = force_unicode(pathtoebook, filesystem_encoding)
        vh = vprefs.get('viewer_open_history', [])
        try:
            vh.remove(pathtoebook)
        except:
            pass
        vh.insert(0, pathtoebook)
        vprefs.set('viewer_open_history', vh[:50])
        self.build_recent_menu()
        self.footnotes_dock.close()
        self.action_table_of_contents.setDisabled(not self.iterator.toc)
        self.current_book_has_toc = bool(self.iterator.toc)
        self.current_title = title
        self.setWindowTitle(title + ' [%s]' % self.iterator.book_format +
                            ' - ' + self.base_window_title)
        self.pos.setMaximum(sum(self.iterator.pages))
        self.pos.setSuffix(' / %d' % sum(self.iterator.pages))
        self.vertical_scrollbar.setMinimum(100)
        self.vertical_scrollbar.setMaximum(100 * sum(self.iterator.pages))
        self.vertical_scrollbar.setSingleStep(10)
        self.vertical_scrollbar.setPageStep(100)
        self.set_vscrollbar_value(1)
        self.current_index = -1
        QApplication.instance().alert(self, 5000)
        previous = self.set_bookmarks(self.iterator.bookmarks)
        if reopen_at is not None:
            previous = reopen_at
        if open_at is None and previous is not None:
            self.goto_bookmark(previous)
        else:
            if open_at is None:
                self.next_document()
            else:
                if open_at > self.pos.maximum():
                    open_at = self.pos.maximum()
                if open_at < self.pos.minimum():
                    open_at = self.pos.minimum()
                self.goto_page(open_at, loaded_check=False)
def tick(self):
    now = monotonic()
    read_needed, write_needed, readable, remove, close_needed = [], [], [], [], []
    has_ssl = self.ssl_context is not None
    for s, conn in self.connection_map.iteritems():
        if now - conn.last_activity > self.opts.timeout:
            if conn.handle_timeout():
                conn.last_activity = now
            else:
                remove.append((s, conn))
                continue
        wf = conn.wait_for
        if wf is READ or wf is RDWR:
            if wf is RDWR:
                write_needed.append(s)
            if conn.read_buffer.has_data:
                readable.append(s)
            else:
                if has_ssl:
                    conn.drain_ssl_buffer()
                    if conn.ready:
                        (readable if conn.read_buffer.has_data else read_needed).append(s)
                    else:
                        close_needed.append((s, conn))
                else:
                    read_needed.append(s)
        elif wf is WRITE:
            write_needed.append(s)

    for s, conn in remove:
        self.log('Closing connection because of extended inactivity: %s' % conn.state_description)
        self.close(s, conn)

    # The original looped over close_needed with a mismatched variable name
    # (for x, conn ... self.close(s, conn)); close each marked socket itself.
    for s, conn in close_needed:
        self.close(s, conn)

    if readable:
        writable = []
    else:
        try:
            readable, writable, _ = select.select(
                [self.socket.fileno(), self.control_out.fileno()] + read_needed,
                write_needed, [], self.opts.timeout)
        except ValueError:  # self.socket.fileno() == -1
            self.ready = False
            self.log.error('Listening socket was unexpectedly terminated')
            return
        except (select.error, socket.error) as e:
            # select.error has no errno attribute. errno is instead
            # e.args[0]
            if getattr(e, 'errno', e.args[0]) in socket_errors_eintr:
                return
            for s, conn in tuple(self.connection_map.iteritems()):
                try:
                    select.select([s], [], [], 0)
                except (select.error, socket.error) as e:
                    if getattr(e, 'errno', e.args[0]) not in socket_errors_eintr:
                        self.close(s, conn)  # Bad socket, discard
            return

    if not self.ready:
        return

    ignore = set()
    for s, conn, event in self.get_actions(readable, writable):
        if s in ignore:
            continue
        try:
            conn.handle_event(event)
            if not conn.ready:
                self.close(s, conn)
        except JobQueueFull:
            self.log.exception('Server busy handling request: %s' % conn.state_description)
            if conn.ready:
                if conn.response_started:
                    self.close(s, conn)
                else:
                    try:
                        conn.report_busy()
                    except Exception:
                        self.close(s, conn)
        except Exception as e:
            ignore.add(s)
            ssl_terminated = getattr(conn, 'ssl_terminated', False)
            if ssl_terminated:
                self.log.warn('Client tried to initiate SSL renegotiation, closing connection')
                self.close(s, conn)
            else:
                self.log.exception('Unhandled exception in state: %s' % conn.state_description)
                if conn.ready:
                    if conn.response_started:
                        self.close(s, conn)
                    else:
                        try:
                            conn.report_unhandled_exception(e, traceback.format_exc())
                        except Exception:
                            self.close(s, conn)
                else:
                    self.log.error('Error in SSL handshake, terminating connection: %s' % as_unicode(e))
                    self.close(s, conn)
def upgrade_connection_to_ws(self, buf, inheaders, event):
    if self.write(buf):
        if self.websocket_handler is None:
            self.websocket_handler = DummyHandler()
        self.read_frame, self.current_recv_opcode = ReadFrame(), None
        self.in_websocket_mode = True
        try:
            self.websocket_handler.handle_websocket_upgrade(
                self.websocket_connection_id, weakref.ref(self), inheaders)
        except Exception as err:
            self.log.exception('Error in WebSockets upgrade handler:')
            self.websocket_close(UNEXPECTED_ERROR, 'Unexpected error in handler: %r' % as_unicode(err))
        self.handle_event = self.ws_duplex
        self.set_ws_state()
        self.end_send_optimization()
def identify(  # {{{
    self,
    log,
    result_queue,
    abort,
    title=None,
    authors=None,
    identifiers={},
    timeout=30
):
    from lxml import etree
    entry = XPath('//atom:entry')

    query = self.create_query(
        log, title=title, authors=authors, identifiers=identifiers
    )
    if not query:
        log.error('Insufficient metadata to construct query')
        return
    br = self.browser
    log('Making query:', query)
    try:
        raw = br.open_novisit(query, timeout=timeout).read()
    except Exception as e:
        log.exception('Failed to make identify query: %r' % query)
        return as_unicode(e)
    try:
        parser = etree.XMLParser(recover=True, no_network=True)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0],
            parser=parser
        )
        entries = entry(feed)
    except Exception as e:
        log.exception('Failed to parse identify results')
        return as_unicode(e)
    if not entries and title and not abort.is_set():
        if identifiers:
            log('No results found, retrying without identifiers')
            return self.identify(
                log, result_queue, abort, title=title, authors=authors,
                timeout=timeout
            )
        ntitle = cleanup_title(title)
        if ntitle and ntitle != title:
            log('No results found, retrying without sub-title')
            return self.identify(
                log, result_queue, abort, title=ntitle, authors=authors,
                timeout=timeout
            )

    # There is no point running these queries in threads as google
    # throttles requests returning 403 Forbidden errors
    self.get_all_details(br, log, entries, abort, result_queue, timeout)
def parse_request_line(self, buf, event, first=False):  # {{{
    line = self.readline(buf)
    if line is None:
        return
    if line == b'\r\n':
        # Ignore a single leading empty line, as per RFC 2616 sec 4.1
        if first:
            return self.set_state(READ, self.parse_request_line, Accumulator())
        return self.simple_response(httplib.BAD_REQUEST, 'Multiple leading empty lines not allowed')

    try:
        method, uri, req_protocol = line.strip().split(b' ', 2)
        rp = int(req_protocol[5]), int(req_protocol[7])
        self.method = method.decode('ascii').upper()
    except Exception:
        return self.simple_response(httplib.BAD_REQUEST, "Malformed Request-Line")

    if self.method not in HTTP_METHODS:
        return self.simple_response(httplib.BAD_REQUEST, "Unknown HTTP method")

    try:
        self.request_protocol = protocol_map[rp]
    except KeyError:
        return self.simple_response(httplib.HTTP_VERSION_NOT_SUPPORTED)
    self.response_protocol = protocol_map[min((1, 1), rp)]

    scheme, authority, path = parse_request_uri(uri)
    if b'#' in path:
        return self.simple_response(httplib.BAD_REQUEST, "Illegal #fragment in Request-URI.")

    if scheme:
        try:
            self.scheme = scheme.decode('ascii')
        except ValueError:
            return self.simple_response(httplib.BAD_REQUEST, 'Un-decodeable scheme')

    qs = b''
    if b'?' in path:
        path, qs = path.split(b'?', 1)
        try:
            self.query = MultiDict.create_from_query_string(qs)
        except Exception:
            return self.simple_response(httplib.BAD_REQUEST, 'Unparseable query string')

    try:
        path = '%2F'.join(unquote(x).decode('utf-8') for x in quoted_slash.split(path))
    except ValueError as e:
        return self.simple_response(httplib.BAD_REQUEST, as_unicode(e))
    self.path = tuple(filter(None, (x.replace('%2F', '/') for x in path.split('/'))))
    self.header_line_too_long_error_code = httplib.REQUEST_ENTITY_TOO_LARGE
    self.request_line = line.rstrip()
    self.set_state(READ, self.parse_header_line, HTTPHeaderParser(), Accumulator())
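# Note on the version parse in parse_request_line() above: req_protocol is
# expected to look like b'HTTP/1.1', so the bytes at indexes 5 and 7 are the
# major and minor version digits; anything shorter or differently shaped makes
# one of the int() calls raise, which is answered with 400 Bad Request.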
def main(args=sys.argv):
    # Ensure viewer can continue to function if GUI is closed
    os.environ.pop('CALIBRE_WORKER_TEMP_DIR', None)
    reset_base_dir()
    scheme = QWebEngineUrlScheme(FAKE_PROTOCOL.encode('ascii'))
    scheme.setSyntax(QWebEngineUrlScheme.Syntax.Host)
    scheme.setFlags(QWebEngineUrlScheme.SecureScheme)
    QWebEngineUrlScheme.registerScheme(scheme)
    override = 'calibre-ebook-viewer' if islinux else None
    processed_args = []
    internal_book_data = internal_book_data_path = None
    for arg in args:
        if arg.startswith('--internal-book-data='):
            internal_book_data_path = arg.split('=', 1)[1]
            continue
        processed_args.append(arg)
    if internal_book_data_path:
        try:
            with lopen(internal_book_data_path, 'rb') as f:
                internal_book_data = json.load(f)
        finally:
            try:
                os.remove(internal_book_data_path)
            except EnvironmentError:
                pass
    args = processed_args
    app = Application(args, override_program_name=override, windows_app_uid=VIEWER_APP_UID)

    parser = option_parser()
    opts, args = parser.parse_args(args)
    oat = opts.open_at
    if oat and not (
            oat.startswith('toc:') or oat.startswith('toc-href:') or
            oat.startswith('toc-href-contains:') or oat.startswith('epubcfi(/') or
            is_float(oat) or oat.startswith('ref:')):
        raise SystemExit('Not a valid --open-at value: {}'.format(opts.open_at))

    listener = None
    if get_session_pref('singleinstance', False):
        try:
            listener = ensure_single_instance(args, opts.open_at)
        except Exception as e:
            import traceback
            error_dialog(None, _('Failed to start viewer'), as_unicode(e),
                         det_msg=traceback.format_exc(), show=True)
            raise SystemExit(1)

    acc = EventAccumulator(app)
    app.file_event_hook = acc
    app.load_builtin_fonts()
    app.setWindowIcon(QIcon(I('viewer.png')))
    migrate_previous_viewer_prefs()
    main = EbookViewer(open_at=opts.open_at, continue_reading=opts.continue_reading,
                       force_reload=opts.force_reload, calibre_book_data=internal_book_data)
    main.set_exception_handler()
    if len(args) > 1:
        acc.events.append(os.path.abspath(args[-1]))
    acc.got_file.connect(main.handle_commandline_arg)
    main.show()
    main.msg_from_anotherinstance.connect(main.another_instance_wants_to_talk, type=Qt.QueuedConnection)
    if listener is not None:
        t = Thread(name='ConnListener', target=listen, args=(listener, main.msg_from_anotherinstance))
        t.daemon = True
        t.start()
    QTimer.singleShot(0, acc.flush)
    if opts.raise_window:
        main.raise_()
    if opts.full_screen:
        main.set_full_screen(True)

    app.exec_()
    if listener is not None:
        listener.close()
def _load_index(self):
    'Load the index, automatically removing incorrectly sized thumbnails and pruning to fit max_size'
    try:
        os.makedirs(self.location)
    except OSError as err:
        if err.errno != errno.EEXIST:
            self.log('Failed to make thumbnail cache dir:', as_unicode(err))
    self.total_size = 0
    self.items = OrderedDict()
    order = self._read_order()

    def listdir(*args):
        try:
            return os.listdir(os.path.join(*args))
        except EnvironmentError:
            return ()  # not a directory or no permission or whatever

    entries = ('/'.join((parent, subdir, entry))
               for parent in listdir(self.location)
               for subdir in listdir(self.location, parent)
               for entry in listdir(self.location, parent, subdir))

    invalidate = set()
    try:
        with open(os.path.join(self.location, 'invalidate'), 'rb') as f:
            raw = f.read()
    except EnvironmentError as err:
        if getattr(err, 'errno', None) != errno.ENOENT:
            self.log('Failed to read thumbnail invalidate data:', as_unicode(err))
    else:
        try:
            os.remove(os.path.join(self.location, 'invalidate'))
        except EnvironmentError as err:
            self.log('Failed to remove thumbnail invalidate data:', as_unicode(err))
        else:
            def record(line):
                try:
                    uuid, book_id = line.partition(' ')[0::2]
                    book_id = int(book_id)
                    return (uuid, book_id)
                except Exception:
                    return None
            invalidate = {record(x) for x in raw.splitlines()}

    items = []
    try:
        for entry in entries:
            try:
                uuid, name = entry.split('/')[0::2]
                book_id, timestamp, size, thumbnail_size = name.split('-')
                book_id, timestamp, size = int(book_id), float(timestamp), int(size)
                thumbnail_size = tuple(map(int, thumbnail_size.partition('x')[0::2]))
            except (ValueError, TypeError, IndexError, KeyError, AttributeError):
                continue
            key = (uuid, book_id)
            path = os.path.join(self.location, entry)
            if self.thumbnail_size == thumbnail_size and key not in invalidate:
                items.append((key, Entry(path, size, timestamp, thumbnail_size)))
                self.total_size += size
            else:
                self._do_delete(path)
    except EnvironmentError as err:
        self.log('Failed to read thumbnail cache dir:', as_unicode(err))

    self.items = OrderedDict(sorted(items, key=lambda x: order.get(hash(x[0]), 0)))
    self._apply_size()
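# On-disk layout assumed by _load_index() above, matching the path built in
# insert(): <location>/<group_id>/<book_id % 100>/<book_id>-<ts>-<size>-<WxH>,
# so a purely illustrative entry might be 'somegroup/37/1237-1423456789.5-40517-60x80'.
# The three nested listdir() generators walk exactly that two-level fan-out,
# and name.split('-') recovers the fields encoded in the leaf file name.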
def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=30):
    '''
    .. note::
        this method will retry without identifiers automatically if no
        match is found with identifiers.
    '''
    matches = []
    # Unlike the other metadata sources, if we have a shelfari id then we
    # do not need to fire a "search" at Shelfari.com. Instead we will be
    # able to go straight to the URL for that book.
    shelfari_id = identifiers.get('shelfari', None)
    isbn = check_isbn(identifiers.get('isbn', None))
    br = self.browser
    if shelfari_id:
        matches.append('%s/books/%s' % (Shelfari.BASE_URL, shelfari_id))
    else:
        query = self._create_query(log, title=title, authors=authors,
                                   identifiers=identifiers)
        if query is None:
            log.error('Insufficient metadata to construct query')
            return
        try:
            log.info('Querying: %s' % query)
            response = br.open_novisit(query, timeout=timeout)
            if isbn:
                # Check whether we got redirected to a book page for ISBN searches.
                # If we did, will use the url.
                # If we didn't then treat it as no matches on Shelfari
                location = response.geturl()
                if '/search/' not in location:
                    log.info('ISBN match location: %r' % location)
                    matches.append(location)
        except Exception as e:
            err = 'Failed to make identify query: %r' % query
            log.exception(err)
            return as_unicode(e)

        # For ISBN based searches we have already done everything we need to
        # So anything from this point below is for title/author based searches.
        if not isbn:
            try:
                raw = response.read().strip()
                # open('E:\\t.html', 'wb').write(raw)
                raw = raw.decode('utf-8', errors='replace')
                if not raw:
                    log.error('Failed to get raw result for query: %r' % query)
                    return
                root = fromstring(clean_ascii_chars(raw))
            except:
                msg = 'Failed to parse shelfari page for query: %r' % query
                log.exception(msg)
                return msg
            # Now grab the first value from the search results, provided the
            # title and authors appear to be for the same book
            self._parse_search_results(log, title, authors, root, matches, timeout)

    if abort.is_set():
        return

    if not matches:
        # If there's no matches, normally we would try to query with less
        # info, but shelfari's search is already fuzzy
        log.error('No matches found with query: %r' % query)
        return

    # Setup worker threads to look more thoroughly at matching books to
    # extract information
    workers = [Worker(url, result_queue, br, log, i, self) for i, url in
               enumerate(matches)]

    # Start the workers and stagger them so we don't hammer shelfari :)
    for w in workers:
        w.start()
        time.sleep(0.1)

    while not abort.is_set():
        a_worker_is_alive = False
        for w in workers:
            w.join(0.2)
            if abort.is_set():
                break
            if w.is_alive():
                a_worker_is_alive = True
        if not a_worker_is_alive:
            break
    return None
def parse_comments(self, root, raw):
    from urllib import unquote
    import html5lib  # needed by both branches below, so import it up front
    ans = ''
    ns = tuple(self.selector('#bookDescription_feature_div noscript'))
    if ns:
        ns = ns[0]
        if len(ns) == 0 and ns.text:
            # html5lib parsed noscript as CDATA
            ns = html5lib.parseFragment('<div>%s</div>' % (ns.text),
                    treebuilder='lxml', namespaceHTMLElements=False)[0]
        else:
            ns.tag = 'div'
        ans = self._render_comments(ns)
    else:
        desc = root.xpath('//div[@id="ps-content"]/div[@class="content"]')
        if desc:
            ans = self._render_comments(desc[0])

    desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
    if desc:
        ans += self._render_comments(desc[0])
    else:
        # Idiot chickens from amazon strike again. This data is now stored
        # in a JS variable inside a script tag URL encoded.
        m = re.search(b'var\s+iframeContent\s*=\s*"([^"]+)"', raw)
        if m is not None:
            try:
                text = unquote(m.group(1)).decode('utf-8')
                nr = html5lib.parse(text, treebuilder='lxml', namespaceHTMLElements=False)
                desc = nr.xpath('//div[@id="productDescription"]/*[@class="content"]')
                if desc:
                    ans += self._render_comments(desc[0])
            except Exception as e:
                self.log.warn('Parsing of obfuscated product description failed with error: %s' % as_unicode(e))
    return ans
def mobi_exploder(path, tdir, question=lambda x: True):
    from calibre.ebooks.mobi.tweak import explode, BadFormat
    try:
        return explode(path, tdir, question=question)
    except BadFormat as e:
        raise Error(as_unicode(e))
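# A hedged usage sketch for mobi_exploder(). TemporaryDirectory is assumed
# to be calibre's usual temp-dir context manager; the question callback
# takes a single message argument, per the default above.
import os
from calibre.ptempfile import TemporaryDirectory


def list_exploded_files(path):
    # Explode a MOBI file into a scratch directory and list its contents,
    # auto-answering any confirmation question with True.
    with TemporaryDirectory('_mobi_explode') as tdir:
        mobi_exploder(path, tdir, question=lambda msg: True)
        return sorted(os.listdir(tdir))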
def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
             identifiers={}, timeout=30):
    import json

    br = self.browser
    br.addheaders = [
        ('Referer', 'https://www.edelweiss.plus/'),
        ('X-Requested-With', 'XMLHttpRequest'),
        ('Cache-Control', 'no-cache'),
        ('Pragma', 'no-cache'),
    ]
    if 'edelweiss' in identifiers:
        items = [identifiers['edelweiss']]
    else:
        log.error('Currently Edelweiss returns random books for search queries')
        return
        # NOTE: the search code below is unreachable; it is presumably kept
        # so it can be re-enabled if Edelweiss's search becomes usable again.
        query = self.create_query(log, title=title, authors=authors,
                                  identifiers=identifiers)
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        log('Using query URL:', query)
        try:
            raw = br.open(query, timeout=timeout).read().decode('utf-8')
        except Exception as e:
            log.exception('Failed to make identify query: %r' % query)
            return as_unicode(e)
        items = re.search(r'window[.]items\s*=\s*(.+?);', raw)
        if items is None:
            log.error('Failed to get list of matching items')
            log.debug('Response text:')
            log.debug(raw)
            return
        items = json.loads(items.group(1))

    if (not items and identifiers and title and authors and
            not abort.is_set()):
        return self.identify(log, result_queue, abort, title=title,
                             authors=authors, timeout=timeout)
    if not items:
        return

    workers = []
    items = items[:5]
    for i, item in enumerate(get_basic_data(self.browser, log, *items)):
        sku = item['sku']
        for isbn in item['isbns']:
            self.cache_isbn_to_identifier(isbn, sku)
        if item['cover']:
            self.cache_identifier_to_cover_url(sku, item['cover'])
        fmt = item['format'].lower()
        if 'audio' in fmt or 'mp3' in fmt:
            continue  # Audio-book, ignore
        workers.append(Worker(item, i, result_queue, br.clone_browser(),
                              timeout, log, self))

    if not workers:
        return

    for w in workers:
        w.start()
        # Don't send all requests at the same time
        time.sleep(0.1)

    while not abort.is_set():
        a_worker_is_alive = False
        for w in workers:
            w.join(0.2)
            if abort.is_set():
                break
            if w.is_alive():
                a_worker_is_alive = True
        if not a_worker_is_alive:
            break
def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=30, nested=False):
    matches = []
    cbdb_id = identifiers.get('cbdb', None)
    isbn = check_isbn(identifiers.get('isbn', None))
    br = self.browser
    if cbdb_id:
        matches.append(BASE_BOOK_URL % (BASE_URL, cbdb_id))
    else:
        query = self.create_query(log, title=title, authors=authors,
                                  identifiers=identifiers)
        if query is None:
            log.error('Insufficient metadata to construct query')
            return
        try:
            log.info('Querying: %s' % query)
            response = br.open_novisit(query, timeout=timeout)
            if isbn:
                # Check whether we got redirected to a book page for ISBN
                # searches. If we did, use that URL; if we did not, treat
                # it as no matches on CBDB.
                location = response.geturl()
                if '/kniha-' in location:
                    log.info('ISBN match location: %r' % location)
                    matches.append(location)
        except IOError as e:
            log.warning('Connection problem. Check your Internet connection')
            return as_unicode(e)
        except Exception as e:
            log.exception('Failed to make identify query: %r' % query)
            return as_unicode(e)

        # For ISBN based searches we have already done everything we need.
        # Anything from this point on is for title/author based searches.
        # However, CBDB no longer redirects when there is just one match,
        # so also parse the results page when an ISBN search found nothing.
        if not isbn or len(matches) == 0:
            try:
                raw = response.read().strip()
                raw = raw.decode('utf-8', errors='replace')
                if not raw:
                    log.error('Failed to get raw result for query: %r' % query)
                    return
                cln = clean_ascii_chars(raw)
                idxs = cln.find('<!DOCTYPE')
                if idxs == -1:
                    log.error('Failed to find HTML document')
                    return
                vld = cln[idxs:]
                idxs = vld.find('<head>')
                if idxs == -1:
                    log.error('Failed to find HEAD element')
                    return
                # Everything before <head>, i.e. the <!DOCTYPE ...> prologue
                hdr = vld[:idxs]
                idxs = vld.find('<h2>Nalezeno')
                if idxs == -1:
                    log.error('Incorrect document structure 1')
                    return
                idxe = vld.find('</h2>', idxs)
                if idxe == -1:
                    log.error('Incorrect document structure 2')
                    return
                arr = vld[idxs:idxe].split(':')
                if len(arr) != 2:
                    log.error('Incorrect document structure 3')
                    return
                cnt = int(arr[1])
                if cnt != 0:
                    # At least one publication found: rebuild the HTML so it
                    # contains just the relevant data, a heading with the
                    # result count followed by the results table.
                    hdr += '<HEAD/><BODY><H3>' + str(cnt) + '</H3>'
                    idxs = vld.find('<table', idxe)
                    if idxs == -1:
                        log.error('Incorrect document structure 11')
                        return
                    idxe = vld.find('</table>', idxs)
                    if idxe == -1:
                        log.error('Incorrect document structure 12')
                        return
                    hdr += vld[idxs:idxe + 8] + '</BODY></HTML>'
                    vld = hdr
                else:
                    # Nothing found, so use an empty HTML document
                    vld = '<HTML/>'
                root = fromstring(vld)
            except Exception:
                msg = 'Failed to parse CBDB page for query: %r' % query
                log.exception(msg)
                return msg

            # Now grab values from the search results, provided the title
            # and authors appear to be for the same book. An ISBN search
            # of course has at most one result.
            if isbn:
                self._parse_isbn_search_results(log, root, matches)
            else:
                self._parse_search_results(log, title, authors, root,
                                           matches, timeout)

    if abort.is_set():
        return

    if len(matches) == 0:
        if nested:
            return
        log.info('No matches found, trying to strip accents')
        if not self.identify(log, result_queue, abort,
                             title=self.strip_accents(title),
                             authors=self.strip_accents(authors),
                             timeout=30, nested=True):
            log.info('No matches found, trying to strip numbers')
            if not self.identify(log, result_queue, abort,
                                 title=self.strip_accents(title.rstrip(string.digits)),
                                 authors=self.strip_accents(authors),
                                 timeout=30, nested=True):
                log.error('No matches found with query: %r' % query)
        return

    from calibre_plugins.CBDB.worker import Worker
    workers = [Worker(url, result_queue, br, log, i, self)
               for i, url in enumerate(matches)]

    for w in workers:
        w.start()
        # Don't send all requests at the same time
        time.sleep(0.1)

    while not abort.is_set():
        a_worker_is_alive = False
        for w in workers:
            w.join(0.2)
            if abort.is_set():
                break
            if w.is_alive():
                a_worker_is_alive = True
        if not a_worker_is_alive:
            break

    return None
def get_usb_info(usbdev, debug=False):  # {{{
    '''
    Get the USB info (manufacturer/product names and serial number) for a
    device. Requires communication with the hub the device is connected to.

    :param usbdev: A USB device as returned by :function:`scan_usb_devices`
    '''
    ans = {}
    hub_map = {devinfo.DevInst: path for devinfo, path in
               DeviceSet(guid=GUID_DEVINTERFACE_USB_HUB).interfaces()}
    for parent in iterancestors(usbdev.devinst):
        parent_path = hub_map.get(parent)
        if parent_path is not None:
            break
    else:
        if debug:
            prints('Cannot get USB info as the parent of the device is not a hub,'
                   ' or the device has no parent (it was probably disconnected)')
        return ans
    for devlist, devinfo in DeviceSet(guid=GUID_DEVINTERFACE_USB_DEVICE).devices():
        if devinfo.DevInst == usbdev.devinst:
            device_port = get_device_registry_property(
                devlist, byref(devinfo), SPDRP_ADDRESS)[1]
            break
    else:
        return ans
    if not device_port:
        if debug:
            prints('Cannot get USB info as the SPDRP_ADDRESS property is not'
                   ' present in the registry (can happen with broken USB hub drivers)')
        return ans
    handle = CreateFile(parent_path, GENERIC_READ | GENERIC_WRITE,
                        FILE_SHARE_READ | FILE_SHARE_WRITE, None,
                        OPEN_EXISTING, 0, None)
    try:
        buf, dd = get_device_descriptor(handle, device_port)
        if (dd.idVendor == usbdev.vendor_id and
                dd.idProduct == usbdev.product_id and
                dd.bcdDevice == usbdev.bcd):
            # No need to read the supported languages, since we only care
            # about the English names
            for index, name in ((dd.iManufacturer, 'manufacturer'),
                                (dd.iProduct, 'product'),
                                (dd.iSerialNumber, 'serial_number')):
                if index:
                    try:
                        buf, ans[name] = get_device_string(
                            handle, device_port, index, buf=buf)
                    except OSError as err:
                        if debug:
                            # Note that this has been observed to fail
                            # randomly after the device has been connected
                            # for some time; disconnecting and reconnecting
                            # causes it to start working again.
                            prints('Failed to read %s from device, with error: [%d] %s'
                                   % (name, err.winerror, as_unicode(err)))
    finally:
        CloseHandle(handle)
    return ans
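# A hedged usage sketch for get_usb_info(). scan_usb_devices() is the
# source of usbdev named in the docstring; which keys appear in the result
# depends on what the device descriptor advertised.
for dev in scan_usb_devices():
    info = get_usb_info(dev, debug=True)
    print(dev.vendor_id, dev.product_id, info.get('serial_number'))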
def format_errorstack(self, errs):
    return '\n'.join('%d:%s' % (code, as_unicode(msg)) for code, msg in errs)
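# For reference, a hypothetical input/output pair for format_errorstack():
# given errs = [(7, 'General error'), (19, u'Invalid parameter')], it
# returns '7:General error\n19:Invalid parameter'.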
def open(self, connected_device, library_uuid):
    self.dev = self._filesystem_cache = None
    try:
        self.dev = self.create_device(connected_device)
    except Exception as e:
        self.blacklisted_devices.add(connected_device)
        raise OpenFailed('Failed to open %s: Error: %s' % (
            connected_device, as_unicode(e)))

    try:
        storage = sorted(self.dev.storage_info, key=operator.itemgetter('id'))
    except self.libmtp.MTPError as e:
        if "The device has no storage information." in unicode_type(e):
            # This happens on newer Android devices while waiting for
            # the user to allow access. Apparently what happens is
            # that when the user clicks allow, the device disconnects
            # and re-connects as a new device.
            name = self.dev.friendly_name or ''
            if not name:
                if connected_device.manufacturer:
                    name = connected_device.manufacturer
                if connected_device.product:
                    name = name and (name + ' ')
                    name += connected_device.product
                name = name or _('Unnamed device')
            raise OpenActionNeeded(name, _(
                'The device {0} is not allowing connections.'
                ' Unlock the screen on the {0}, tap "Allow" on any connection'
                ' popup message you see, then either wait a minute or restart'
                ' calibre. You might also have to change the mode of the USB'
                ' connection on the {0} to "Media Transfer mode (MTP)" or'
                ' similar.').format(name), (name, self.dev.serial_number))
        raise

    storage = [x for x in storage if x.get('rw', False)]
    if not storage:
        self.blacklisted_devices.add(connected_device)
        raise OpenFailed('No storage found for device %s' % (connected_device,))
    snum = self.dev.serial_number
    if snum in self.prefs.get('blacklist', []):
        self.blacklisted_devices.add(connected_device)
        self.dev = None
        raise BlacklistedDevice(
            'The %s device has been blacklisted by the user' % (connected_device,))
    self._main_id = storage[0]['id']
    self._carda_id = self._cardb_id = None
    if len(storage) > 1:
        self._carda_id = storage[1]['id']
    if len(storage) > 2:
        self._cardb_id = storage[2]['id']
    self.current_friendly_name = self.dev.friendly_name
    if not self.current_friendly_name:
        self.current_friendly_name = self.dev.model_name or _('Unknown MTP device')
    self.current_serial_num = snum
    self.currently_connected_dev = connected_device
def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=90):  # {{{
    from calibre.ebooks.chardet import xml_to_unicode
    from HTMLParser import HTMLParser
    from lxml import etree, html

    if not self.is_configured():
        return
    query = self.create_query(log, title=title, authors=authors,
                              identifiers=identifiers)
    if not query:
        err = u'Insufficient metadata to construct query'
        log.error(err)
        return err

    try:
        raw = self.browser.open_novisit(query).read()
    except Exception as e:
        log.exception(u'Failed to make identify query: %r' % query)
        return as_unicode(e)

    try:
        doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
        entries_block = doc.xpath(u'//div[@class="bSearchResult"]')

        if entries_block:
            entries = doc.xpath(u'//div[contains(@itemprop, "itemListElement")]')
            metadata = self.get_metadata(log, entries, title, authors, identifiers)
            self.get_all_details(log, metadata, abort, result_queue,
                                 identifiers, timeout)
        else:
            # Redirect page: try to extract the ozon_id from the javascript data
            h = HTMLParser()
            entry_string = h.unescape(unicode(etree.tostring(doc, pretty_print=True)))
            id_title_pat = re.compile(u'products":\[{"id":(\d{7}),"name":"([а-яА-Я :\-0-9]+)')
            # result containing ozon_id and entry_title
            entry_info = re.search(id_title_pat, entry_string)
            ozon_id = entry_info.group(1) if entry_info else None
            entry_title = entry_info.group(2) if entry_info else None

            if ozon_id:
                metadata = self.to_metadata_for_single_entry(
                    log, ozon_id, entry_title, authors)
                identifiers['ozon'] = ozon_id
                self.get_all_details(log, [metadata], abort, result_queue,
                                     identifiers, timeout, cachedPagesDict={})
            else:
                log.error('No SearchResults found in the Ozon.ru response')
    except Exception as e:
        log.exception('Failed to parse identify results')
        return as_unicode(e)
def ws_data_received(self, data, opcode, frame_starting, frame_finished,
                     is_final_frame_of_message):
    if opcode in CONTROL_CODES:
        return self.ws_control_frame(opcode, data)

    message_starting = self.current_recv_opcode is None
    if message_starting:
        if opcode == CONTINUATION:
            self.log.error('Client sent continuation frame with no message to continue')
            self.websocket_close(PROTOCOL_ERROR, 'Continuation frame without any message to continue')
            return
        self.current_recv_opcode = opcode
    elif frame_starting and opcode != CONTINUATION:
        self.log.error('Client sent continuation frame with non-zero opcode')
        self.websocket_close(PROTOCOL_ERROR, 'Continuation frame with non-zero opcode')
        return
    message_finished = frame_finished and is_final_frame_of_message
    if self.current_recv_opcode == TEXT:
        if message_starting:
            self.frag_decoder.reset()
        empty_data = len(data) == 0
        try:
            data = self.frag_decoder(data)
        except ValueError:
            self.frag_decoder.reset()
            self.log.error('Client sent undecodeable UTF-8')
            return self.websocket_close(INCONSISTENT_DATA, 'Not valid UTF-8')
        if message_finished:
            if (not data and not empty_data) or self.frag_decoder.state:
                self.frag_decoder.reset()
                self.log.error('Client sent undecodeable UTF-8')
                return self.websocket_close(INCONSISTENT_DATA, 'Not valid UTF-8')
    if message_finished:
        self.current_recv_opcode = None
        self.frag_decoder.reset()
    try:
        self.handle_websocket_data(data, message_starting, message_finished)
    except Exception as err:
        self.log.exception('Error in WebSockets data handler:')
        self.websocket_close(UNEXPECTED_ERROR, 'Unexpected error in handler: %r' % as_unicode(err))
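# ws_data_received() above leans on an incremental UTF-8 decoder
# (self.frag_decoder) that decodes across frame boundaries and exposes a
# truthy `state` while a multi-byte sequence is incomplete. calibre has
# its own implementation; this is a minimal sketch of the same contract
# using the standard library (the class name here is an assumption):
import codecs


class UTF8FragmentDecoder(object):

    def __init__(self):
        self.reset()

    def reset(self):
        self._dec = codecs.getincrementaldecoder('utf-8')(errors='strict')
        self.state = 0  # number of buffered bytes from an incomplete sequence

    def __call__(self, data):
        # Raises UnicodeDecodeError (a ValueError subclass) on invalid
        # UTF-8, matching the except ValueError clause above
        ans = self._dec.decode(bytes(data), final=False)
        self.state = len(self._dec.getstate()[0])
        return ans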
def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
             identifiers={}, timeout=30):
    from urlparse import parse_qs

    book_url = self._get_book_url(identifiers.get('edelweiss', None))
    br = self.browser
    if book_url:
        entries = [(book_url, identifiers['edelweiss'])]
    else:
        entries = []
        query = self.create_query(log, title=title, authors=authors,
                                  identifiers=identifiers)
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        log('Using query URL:', query)
        try:
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception as e:
            log.exception('Failed to make identify query: %r' % query)
            return as_unicode(e)
        try:
            root = parse_html(raw)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)

        from css_selectors import Select
        select = Select(root)
        has_isbn = check_isbn(identifiers.get('isbn', None)) is not None
        if not has_isbn:
            author_tokens = set(x.lower() for x in self.get_author_tokens(
                authors, only_first_author=True))
        for entry in select('div.listRow div.listRowMain'):
            a = entry.xpath('descendant::a[contains(@href, "sku=") and contains(@href, "productDetailPage.aspx")]')
            if not a:
                continue
            href = a[0].get('href')
            prefix, qs = href.partition('?')[0::2]
            sku = parse_qs(qs).get('sku', None)
            if sku and sku[0]:
                sku = sku[0]
                div = tuple(select('div.sku.attGroup'))
                if div:
                    text = astext(div[0])
                    isbns = [check_isbn(x.strip()) for x in text.split(',')]
                    for isbn in isbns:
                        if isbn:
                            self.cache_isbn_to_identifier(isbn, sku)
                for img in entry.xpath('descendant::img[contains(@src, "/jacket_covers/thumbnail/")]'):
                    self.cache_identifier_to_cover_url(
                        sku, img.get('src').replace('/thumbnail/', '/flyout/'))
                div = tuple(select('div.format.attGroup'))
                text = astext(div[0]).lower()
                if 'audio' in text or 'mp3' in text:
                    continue  # Audio-book, ignore
                if not has_isbn:
                    # edelweiss returns matches based only on title, so we
                    # filter by author manually
                    div = tuple(select('div.contributor.attGroup'))
                    try:
                        entry_authors = set(self.get_author_tokens(
                            [x.strip() for x in astext(div[0]).lower().split(',')]))
                    except IndexError:
                        entry_authors = set()
                    if not entry_authors.issuperset(author_tokens):
                        continue
                entries.append((self._get_book_url(sku), sku))

    if (not entries and identifiers and title and authors and
            not abort.is_set()):
        return self.identify(log, result_queue, abort, title=title,
                             authors=authors, timeout=timeout)

    if not entries:
        return

    workers = [Worker(skul, url, i, result_queue, br.clone_browser(),
                      timeout, log, self)
               for i, (url, skul) in enumerate(entries[:5])]

    for w in workers:
        w.start()
        # Don't send all requests at the same time
        time.sleep(0.1)

    while not abort.is_set():
        a_worker_is_alive = False
        for w in workers:
            w.join(0.2)
            if abort.is_set():
                break
            if w.is_alive():
                a_worker_is_alive = True
        if not a_worker_is_alive:
            break
def vacuum(self):
    try:
        self.db.vacuum()
    except Exception as e:
        import traceback
        self.error = (as_unicode(e), traceback.format_exc())
def subset_all_fonts(container, font_stats, report):
    remove = set()
    total_old = total_new = 0
    changed = False
    for name, mt in iter_subsettable_fonts(container):
        chars = font_stats.get(name, set())
        with container.open(name, 'rb') as f:
            f.seek(0, os.SEEK_END)
            total_old += f.tell()
        if not chars:
            remove.add(name)
            report(_('Removed unused font: %s') % name)
            continue
        with container.open(name, 'r+b') as f:
            raw = f.read()
            try:
                font_name = get_font_names(raw)[-1]
            except Exception as e:
                container.log.warning(
                    'Corrupted font: %s, ignoring. Error: %s' % (
                        name, as_unicode(e)))
                continue
            warnings = []
            container.log('Subsetting font: %s' % (font_name or name))
            try:
                nraw, old_sizes, new_sizes = subset(raw, chars, warnings=warnings)
            except UnsupportedFont as e:
                container.log.warning(
                    'Unsupported font: %s, ignoring. Error: %s' % (
                        name, as_unicode(e)))
                continue
            for w in warnings:
                container.log.warn(w)
            olen = sum(itervalues(old_sizes))
            nlen = sum(itervalues(new_sizes))
            total_new += len(nraw)
            if nlen == olen:
                report(_('The font %s was already subset') % font_name)
            else:
                report(_('Decreased the font {0} to {1} of its original size').format(
                    font_name, ('%.1f%%' % (nlen / olen * 100))))
                changed = True
            f.seek(0)
            f.truncate()
            f.write(nraw)

    for name in remove:
        container.remove_item(name)
        changed = True

    if remove:
        for name, mt in iteritems(container.mime_map):
            if mt in OEB_STYLES:
                sheet = container.parsed(name)
                if remove_font_face_rules(container, sheet, remove, name):
                    container.dirty(name)
            elif mt in OEB_DOCS:
                for style in XPath('//h:style')(container.parsed(name)):
                    if style.get('type', 'text/css') == 'text/css' and style.text:
                        sheet = container.parse_css(style.text, name)
                        if remove_font_face_rules(container, sheet, remove, name):
                            style.text = css_text(sheet)
                            container.dirty(name)
    if total_old > 0:
        report(_('Reduced total font size to %.1f%% of original') % (
            total_new / total_old * 100))
    else:
        report(_('No embedded fonts found'))
    return changed
def identify(self, log, result_queue, abort, title=None, authors=None,
             identifiers={}, timeout=30):
    '''
    Note: this method will retry without identifiers automatically if no
    match is found with identifiers.
    '''
    matches = []
    # Unlike the other metadata sources, if we have a kyobobook id then we
    # do not need to fire a "search" at kyobobook.com. Instead we will be
    # able to go straight to the URL for that book.
    kyobobook_id = identifiers.get('kyobobook', None)
    isbn = check_isbn(identifiers.get('isbn', None))
    br = self.browser
    if kyobobook_id:
        matches.append('%s/product/detailViewKor.laf?barcode=%s' % (
            Kyobobook.BASE_URL, kyobobook_id))
    else:
        query = self.create_query(log, title=title, authors=authors,
                                  identifiers=identifiers)
        if query is None:
            log.error('Insufficient metadata to construct query')
            return
        try:
            log.info('Querying: %s' % query)
            response = br.open_novisit(query, timeout=timeout)
            try:
                raw = response.read().strip()
                # Kyobobook serves its pages encoded as euc-kr, not utf-8
                # (noted by sseeookk)
                raw = raw.decode('euc-kr', errors='ignore')
                if not raw:
                    log.error('Failed to get raw result for query: %r' % query)
                    return
                root = fromstring(clean_ascii_chars(raw))
            except Exception:
                msg = 'Failed to parse kyobobook page for query: %r' % query
                log.exception(msg)
                return msg
            if isbn:
                self._parse_search_isbn_results(log, isbn, root, matches, timeout)
            else:
                # For ISBN based searches we have already done everything we
                # need; title/author searches grab the first value from the
                # search results, provided the title and authors appear to
                # be for the same book
                self._parse_search_results(log, title, authors, root,
                                           matches, timeout)
        except Exception as e:
            log.exception('Failed to make identify query: %r' % query)
            return as_unicode(e)

    if abort.is_set():
        return

    if not matches:
        if identifiers and title and authors:
            log.info('No matches found with identifiers, retrying using only'
                     ' title and authors')
            return self.identify(log, result_queue, abort, title=title,
                                 authors=authors, timeout=timeout)
        log.error('No matches found with query: %r' % query)
        return

    from calibre_plugins.kyobobook.worker import Worker
    workers = [Worker(url, result_queue, br, log, i, self)
               for i, url in enumerate(matches)]

    for w in workers:
        w.start()
        # Don't send all requests at the same time
        time.sleep(0.1)

    while not abort.is_set():
        a_worker_is_alive = False
        for w in workers:
            w.join(0.2)
            if abort.is_set():
                break
            if w.is_alive():
                a_worker_is_alive = True
        if not a_worker_is_alive:
            break

    return None
def find_page_breaks(self, item):
    if self.page_break_selectors is None:
        self.page_break_selectors = set()
        stylesheets = [x.data for x in self.oeb.manifest
                       if x.media_type in OEB_STYLES]
        for rule in rules(stylesheets):
            before = getattr(rule.style.getPropertyCSSValue('page-break-before'),
                             'cssText', '').strip().lower()
            after = getattr(rule.style.getPropertyCSSValue('page-break-after'),
                            'cssText', '').strip().lower()
            try:
                if before and before not in {'avoid', 'auto', 'inherit'}:
                    self.page_break_selectors.add((rule.selectorText, True))
                    if self.remove_css_pagebreaks:
                        rule.style.removeProperty('page-break-before')
            except Exception:
                pass
            try:
                if after and after not in {'avoid', 'auto', 'inherit'}:
                    self.page_break_selectors.add((rule.selectorText, False))
                    if self.remove_css_pagebreaks:
                        rule.style.removeProperty('page-break-after')
            except Exception:
                pass
    page_breaks = set()
    select = Select(item.data)
    if not self.page_break_selectors:
        return [], []
    body = item.data.xpath('//h:body', namespaces=NAMESPACES)
    if not body:
        return [], []
    descendants = frozenset(body[0].iterdescendants('*'))

    for selector, before in self.page_break_selectors:
        try:
            for elem in select(selector):
                if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {
                        'html', 'body', 'head', 'style', 'script', 'meta', 'link'}:
                    elem.set('pb_before', '1' if before else '0')
                    page_breaks.add(elem)
        except SelectorError as err:
            self.log.warn('Ignoring page breaks specified with invalid CSS'
                          ' selector: %r (%s)' % (selector, as_unicode(err)))

    for i, elem in enumerate(item.data.iter('*')):
        try:
            elem.set('pb_order', str(i))
        except TypeError:  # Can't set attributes on comment nodes etc.
            continue

    page_breaks = list(page_breaks)
    page_breaks.sort(key=lambda x: int(x.get('pb_order')))
    page_break_ids, page_breaks_ = [], []
    for i, x in enumerate(page_breaks):
        x.set('id', x.get('id', 'calibre_pb_%d' % i))
        id = x.get('id')
        try:
            xp = XPath('//*[@id="%s"]' % id)
        except Exception:
            try:
                xp = XPath("//*[@id='%s']" % id)
            except Exception:
                # The id contains both a double quote and an apostrophe (or
                # something else pathological). It is unlikely to work
                # anywhere else either, so just replace it.
                id = 'calibre_pb_%d' % i
                x.set('id', id)
                xp = XPath('//*[@id=%r]' % id)
        page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
        page_break_ids.append(id)

    for elem in item.data.iter(etree.Element):
        elem.attrib.pop('pb_order', False)
        elem.attrib.pop('pb_before', False)

    return page_breaks_, page_break_ids
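# Why find_page_breaks() tries two quoting styles: an XPath 1.0 string
# literal cannot contain the quote character that delimits it. A short
# demonstration of the fallback chain:
from lxml.etree import XPath

XPath('//*[@id="a\'b"]')  # apostrophe inside double quotes: fine
XPath("//*[@id='a\"b']")  # double quote inside apostrophes: fine
# An id containing *both* characters cannot be written as a single XPath
# literal, which is why the code above gives up and renames the element.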
def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
             identifiers={}, timeout=30):
    '''
    Note this method will retry without identifiers automatically if no
    match is found with identifiers.
    '''
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    from lxml.html import tostring
    import html5lib

    testing = getattr(self, 'running_a_test', False)

    query, domain = self.create_query(log, title=title, authors=authors,
                                      identifiers=identifiers)
    if query is None:
        log.error('Insufficient metadata to construct query')
        return
    br = self.browser
    if testing:
        print('Using user agent for amazon: %s' % self.user_agent)
    try:
        raw = br.open_novisit(query, timeout=timeout).read().strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            log.error('Query malformed: %r' % query)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = _('Amazon timed out. Try again later.')
            log.error(msg)
        else:
            msg = 'Failed to make identify query: %r' % query
            log.exception(msg)
        return as_unicode(msg)

    raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True,
                                           resolve_entities=True)[0])

    if testing:
        import tempfile
        with tempfile.NamedTemporaryFile(prefix='amazon_results_',
                                         suffix='.html', delete=False) as f:
            f.write(raw.encode('utf-8'))
        print('Downloaded html for results page saved in', f.name)

    matches = []
    found = '<title>404 - ' not in raw

    if found:
        try:
            root = html5lib.parse(raw, treebuilder='lxml',
                                  namespaceHTMLElements=False)
        except Exception:
            msg = 'Failed to parse amazon page for query: %r' % query
            log.exception(msg)
            return msg

        errmsg = root.xpath('//*[@id="errorMessage"]')
        if errmsg:
            msg = tostring(errmsg, method='text', encoding=unicode).strip()
            log.error(msg)
            # The error is almost always a not found error
            found = False

    if found:
        matches = self.parse_results_page(root)

    if abort.is_set():
        return

    if not matches:
        if identifiers and title and authors:
            log('No matches found with identifiers, retrying using only'
                ' title and authors. Query: %r' % query)
            return self.identify(log, result_queue, abort, title=title,
                                 authors=authors, timeout=timeout)
        log.error('No matches found with query: %r' % query)
        return

    workers = [Worker(url, result_queue, br, log, i, domain, self,
                      testing=testing) for i, url in enumerate(matches)]

    for w in workers:
        w.start()
        # Don't send all requests at the same time
        time.sleep(0.1)

    while not abort.is_set():
        a_worker_is_alive = False
        for w in workers:
            w.join(0.2)
            if abort.is_set():
                break
            if w.is_alive():
                a_worker_is_alive = True
        if not a_worker_is_alive:
            break

    return None