def index():
    '''Fill in search text and metadata for a small batch of archives.

    Works on at most ten EpubArchive rows whose identifier is the empty
    string, lowest id first.  For each one:

    * the stored OPF is parsed; if that raises InvalidEpubException the
      archive is appended to a local list and skipped;
    * every HTMLFile belonging to the archive is rendered if it has no
      processed content yet, given searchable words if it has none, and
      saved;
    * the archive's metadata getters are called and the archive is saved.
    '''
    # NOTE(review): this list is filled below but never consumed inside
    # this function -- confirm whether the deletion step lives elsewhere.
    to_delete = []

    pending = EpubArchive.objects.filter(identifier='').order_by('id')[0:10]
    for archive in pending:
        log.debug("Processing %s" % archive.title)

        # An archive whose OPF will not even parse is not worth indexing.
        try:
            util.xml_from_string(archive.opf)
        except InvalidEpubException:
            log.debug("Will delete %s (Bookworm ID %s)" % (archive.title, archive.id))
            to_delete.append(archive)
            continue

        for html_file in HTMLFile.objects.filter(archive=archive):
            try:
                if not html_file.processed_content:
                    log.debug("Rendering HTML content for %s:%s" % (archive.title, html_file.filename))
                    html_file.render()
            except Exception as render_error:
                # Best effort: record the failure and store a placeholder so
                # the word-extraction step below is skipped for this file.
                log.error(render_error)
                html_file.words = "[Unsupported language]"

            if not html_file.words:
                html_file.words = epubindexer.get_searchable_content(html_file.processed_content)
            html_file.save()

        log.debug("Done processing HTML for %s" % archive.title)

        # Original author's note: a None from any of these metadata items
        # means the document is invalid.  The return values are not
        # inspected here; presumably the getters populate fields on the
        # archive as a side effect -- verify against the model.
        metadata_getters = (
            archive.get_subjects,
            archive.get_rights,
            archive.get_language,
            archive.get_publisher,
            archive.get_identifier,
        )
        for getter in metadata_getters:
            getter()

        log.debug("Saving %s" % archive.title)
        archive.save()
def validate(data, fail_silently=True):
    '''Send an epub to the epubcheck validation web service and parse the reply.

    `data` should be an epub as a stream of bytes or a file-like object
    (anything that implements read()).  A file-like object is rewound with
    seek(0) first, because it will usually have been read already.

    Returns:
      * an empty list if the service reports the file as valid;
      * the list of `error` elements from the response if the service
        reports the file as invalid;
      * None if the response carries neither verdict, or if a failure was
        swallowed because `fail_silently` is true.

    By default failures are logged and ignored (the service may be down);
    this covers both the HTTP request and the parsing of the response.
    With fail_silently=False the original exception is re-raised.
    '''
    if hasattr(data, 'read'):
        # This will have already been read, so seek back
        data.seek(0)
        payload = data.read()
    else:
        payload = data

    # Bound before the try so the log message below can always refer to it,
    # even when the request itself is what failed.
    resp = None
    try:
        # The request lives inside the try so that an unreachable service is
        # covered by fail_silently, as the docstring promises.
        resp = urllib.urlopen(settings.EPUBCHECK_WEBSERVICE, payload).read()
        epubcheck_response = toc.xml_from_string(resp)
        is_valid = epubcheck_response.findtext('.//is-valid')
        if is_valid == 'True':
            return []
        if is_valid == 'False':
            return epubcheck_response.findall('.//error')
    except Exception as e:
        if not fail_silently:
            # Bare raise keeps the original traceback intact.
            raise
        log.warn("Failure during epubcheck: %s (response was %s)" % (e, resp))
    return None
def explode(self):
    '''Unpack this epub archive and populate the model from its contents.

    Reads the container file, refuses DRM-protected archives, then loads
    the OPF and TOC documents and hands the open zip to the helpers that
    extract content, stylesheets and images.

    Raises InvalidEpubException when the container file is missing (with a
    dedicated message when it is only reachable via backslash separators)
    or when the TOC named by the OPF is absent from the archive, and
    DRMEpubException when the rights file is present.
    '''
    archive_zip = ZipFile(self.get_content())  # get_content() returns a filehandle

    try:
        container = archive_zip.read(self._CONTAINER)
    except KeyError:
        # Is this DOS-format?  If so, handle this as a special error
        dos_style_name = self._CONTAINER.replace('/', '\\')
        try:
            archive_zip.read(dos_style_name)
        except KeyError:
            raise InvalidEpubException('Was not able to locate container file %s' % self._CONTAINER, archive=self)
        else:
            raise InvalidEpubException("This ePub file was created with DOS/Windows path separators, which is not legal according to the PKZIP specification.")

    # Being able to read the rights file is what marks the archive as DRM'd.
    try:
        archive_zip.read(constants.RIGHTS)
    except KeyError:
        pass
    else:
        raise DRMEpubException()

    opf_filename = self._get_opf_filename(util.xml_from_string(container))
    content_path = self._get_content_path(opf_filename)

    self.opf = archive_zip.read(opf_filename)
    parsed_opf = util.xml_from_string(self.opf)

    item_tag = "{%s}item" % (NS['opf'])
    items = list(parsed_opf.iterdescendants(tag=item_tag))

    toc_filename = self._get_toc(parsed_opf, items, content_path)
    try:
        self.toc = archive_zip.read(toc_filename)
    except KeyError:
        raise InvalidEpubException('TOC file was referenced in OPF, but not found in archive: toc file %s' % toc_filename, archive=self)
    parsed_toc = util.xml_from_string(self.toc)

    self.authors = self._get_authors(parsed_opf)
    self.orderable_author = self.safe_author()
    self.title = self._get_title(parsed_opf)

    self._get_content(archive_zip, parsed_opf, parsed_toc, items, content_path)
    self._get_stylesheets(archive_zip, items, content_path)
    self._get_images(archive_zip, items, content_path)
def explode(self):
    '''Explode this epub archive and populate the model from its contents.

    Reads the container file, rejects DRM-protected archives, loads the OPF
    and TOC documents, sets authors/orderable_author/title, and passes the
    open zip to the helpers that extract content, stylesheets and images.

    Raises InvalidEpubException if the container file or the TOC named by
    the OPF cannot be found in the archive, and DRMEpubException if the
    rights file is present.
    '''
    # get_content() returns a filehandle, which ZipFile accepts directly.
    # (A leftover `z = ZipFile(e)` that referenced an undefined name `e`
    # used to follow this line; it has been removed.)
    z = ZipFile(self.get_content())

    try:
        container = z.read(self._CONTAINER)
    except KeyError:
        raise InvalidEpubException('Was not able to locate container file %s' % self._CONTAINER, archive=self)

    # A readable rights file means the archive is DRM-protected; a KeyError
    # (member not present) is the normal, unprotected case.
    try:
        z.read(constants.RIGHTS)
        raise DRMEpubException()
    except KeyError:
        pass

    parsed_container = util.xml_from_string(container)

    opf_filename = self._get_opf_filename(parsed_container)
    content_path = self._get_content_path(opf_filename)
    self.opf = z.read(opf_filename)
    parsed_opf = util.xml_from_string(self.opf)

    items = [i for i in parsed_opf.iterdescendants(tag="{%s}item" % (NS['opf']))]

    toc_filename = self._get_toc(parsed_opf, items, content_path)
    try:
        self.toc = z.read(toc_filename)
    except KeyError:
        raise InvalidEpubException('TOC file was referenced in OPF, but not found in archive: toc file %s' % toc_filename, archive=self)

    parsed_toc = util.xml_from_string(self.toc)

    self.authors = self._get_authors(parsed_opf)
    self.orderable_author = self.safe_author()
    self.title = self._get_title(parsed_opf)

    self._get_content(z, parsed_opf, parsed_toc, items, content_path)
    self._get_stylesheets(z, items, content_path)
    self._get_images(z, items, content_path)
def _get_metadata(self, metadata_tag, opf, plural=False, as_string=False, as_list=False):
    '''Look up the text of every `metadata_tag` element in the NS['dc'] namespace.

    The OPF string is parsed only when self._parsed_metadata is still None;
    the parsed tree is then kept on the instance and reused.  If parsing
    raises InvalidEpubException the method returns None.

    Return shape:
      * as_list   -- list of stripped, non-empty texts (checked first);
      * as_string -- the same texts joined with ', ';
      * otherwise -- the unstripped texts that are not None: the bare value
        when there is exactly one (wrapped in a 1-tuple if `plural`), and
        a list when there are zero or several.
    '''
    if self._parsed_metadata is None:
        try:
            self._parsed_metadata = util.xml_from_string(opf)
        except InvalidEpubException:
            return None

    xpath = './/{%s}%s' % (NS['dc'], metadata_tag)
    matches = self._parsed_metadata.findall(xpath)

    if as_list or as_string:
        stripped = [node.text.strip() for node in matches if node.text]
        # as_list wins when both flags are set.
        return stripped if as_list else ', '.join(stripped)

    values = [node.text for node in matches if node.text is not None]
    if len(values) != 1:
        return values

    only = values[0]
    if plural:
        return (only, )
    return only
html.save() except DjangoUnicodeDecodeError: raise InvalidEpubException(_("There was a problem related to the encoding of one of the documents in your ePub. All ePub documents must be in UTF-8.")) except _mysql_exceptions.Warning: raise InvalidEpubException(_("There was a problem related to the encoding of one of the documents in your ePub. All ePub documents must be in UTF-8.")) except MySQLdb.OperationalError, e: if 'Incorrect string value' in str(e): raise InvalidEpubException(_("There was a problem related to the encoding of one of the documents in your ePub. All ePub documents must be in UTF-8.")) else: raise e u def _get_metadata(self, metadata_tag, opf, plural=False, as_string=False, as_list=False): '''Returns a metdata item's text content by tag name, or a list if mulitple names match. If as_string is set to True, then always return a comma-delimited string.''' if self._parsed_metadata is None: try: self._parsed_metadata = util.xml_from_string(opf) except InvalidEpubException: return None text = [] alltext = self._parsed_metadata.findall('.//{%s}%s' % (NS['dc'], metadata_tag)) if as_list: return [t.text.strip() for t in alltext if t.text] if as_string: return ', '.join([t.text.strip() for t in alltext if t.text]) for t in alltext: if t.text is not None: text.append(t.text) if len(text) == 1: t = (text[0], ) if plural else text[0] return t return text