Esempio n. 1
0
def index():
    """Backfill search text and metadata for archives with an empty identifier.

    Handles at most ten ``EpubArchive`` rows per call (those whose
    ``identifier`` is ``''``, ordered by ``id``).  For each archive whose OPF
    parses:

    * every ``HTMLFile`` belonging to it is rendered when it has no
      ``processed_content``, given searchable ``words`` when it has none,
      and saved;
    * the metadata getters are invoked and the archive itself is saved.

    Archives whose OPF raises ``InvalidEpubException`` are logged, appended
    to ``to_delete`` and skipped.
    """
    # NOTE(review): ``to_delete`` is filled below but nothing visible in this
    # function consumes it, so unparseable archives keep matching the query
    # on every run -- confirm whether a deletion step is missing.
    to_delete = []
    for e in EpubArchive.objects.filter(identifier='').order_by('id')[0:10]:
        log.debug("Processing %s", e.title)
        # Make sure this is valid at all
        try:
            util.xml_from_string(e.opf)
        except InvalidEpubException:
            log.debug("Will delete %s (SocialBooks ID %s)", e.title, e.id)
            to_delete.append(e)
            continue

        for h in HTMLFile.objects.filter(archive=e):
            try:
                if not h.processed_content:
                    log.debug("Rendering HTML content for %s:%s", e.title, h.filename)
                    h.render()
            except Exception as e1:
                # Best effort: a rendering failure is logged and the file is
                # tagged with a placeholder so it is still saved below.
                log.error(e1)
                h.words = "[Unsupported language]"
            if not h.words:
                h.words = epubindexer.get_searchable_content(h.processed_content)
            h.save()
        log.debug("Done processing HTML for %s", e.title)
        # Original author's note: a None from any of these metadata items
        # means the document is invalid.
        # NOTE(review): the return values are discarded here; presumably the
        # getters populate fields on ``e`` before save() -- confirm.
        e.get_subjects()
        e.get_rights()
        e.get_language()
        e.get_publisher()
        e.get_identifier()
        log.debug("Saving %s", e.title)
        e.save()
Esempio n. 2
0
def validate(data, fail_silently=True):
    '''Send an epub to the epubcheck validation web service and parse the reply.

    `data` should be an epub as a stream of bytes or a file-like object
    (one that implements read() and seek()).

    With `fail_silently=True` (the default) any exception raised while
    contacting the service or parsing its response is logged as a warning
    and swallowed -- the service may be down.  With `fail_silently=False`
    the exception propagates with its original traceback.

    Returns an empty list if the service reports the file as valid, the list
    of `error` elements from the response if it reports the file as invalid,
    and None if no verdict was obtained (unrecognised response, or a failure
    that was silenced).
    '''
    if hasattr(data, 'read'):
        # This will have already been read, so seek back
        data.seek(0)
        payload = data.read()
    else:
        payload = data

    # Pre-set so the warning below is safe when the request itself fails.
    resp = None
    try:
        # The request is made inside the try so that fail_silently also
        # covers network errors, not only parse errors.
        connection = urllib.urlopen(settings.EPUBCHECK_WEBSERVICE, payload)
        try:
            resp = connection.read()
        finally:
            connection.close()

        epubcheck_response = toc.xml_from_string(resp)
        is_valid = epubcheck_response.findtext('.//is-valid')
        if is_valid == 'True':
            return []
        elif is_valid == 'False':
            return epubcheck_response.findall('.//error')
    except Exception as e:
        if not fail_silently:
            # Bare raise keeps the original traceback intact.
            raise
        log.warning("Failure during epubcheck: %s (response was %s)", e, resp)
Esempio n. 3
0
    def explode(self):
        '''Open the epub zip and load its OPF, TOC, metadata and contents onto self.

        Sets self.opf, self.toc, self.authors, self.orderable_author and
        self.title, then hands the archive to the content, stylesheet and
        image loaders.

        Raises InvalidEpubException if the container file cannot be found
        (with a dedicated message when it exists only under a backslash
        path) or if the TOC named by the OPF is absent from the archive.
        Raises DRMEpubException if the archive has a constants.RIGHTS entry.
        '''
        # get_content() returns a filehandle (per the original author's note)
        archive = ZipFile(self.get_content())
        try:
            container_xml = archive.read(self._CONTAINER)
        except KeyError:
            # Is this DOS-format?  If so, handle this as a special error
            try:
                archive.read(self._CONTAINER.replace('/', '\\'))
            except KeyError:
                raise InvalidEpubException('Was not able to locate container file %s' % self._CONTAINER, archive=self)
            else:
                raise InvalidEpubException("This ePub file was created with DOS/Windows path separators, which is not legal according to the PKZIP specification.")

        # A readable rights entry means the archive is rejected as DRM'd.
        try:
            archive.read(constants.RIGHTS)
        except KeyError:
            pass
        else:
            raise DRMEpubException()

        container_tree = util.xml_from_string(container_xml)
        opf_path = self._get_opf_filename(container_tree)
        base_path = self._get_content_path(opf_path)

        self.opf = archive.read(opf_path)
        opf_tree = util.xml_from_string(self.opf)

        item_tag = "{%s}item" % (NS['opf'])
        opf_items = list(opf_tree.iterdescendants(tag=item_tag))

        toc_path = self._get_toc(opf_tree, opf_items, base_path)
        try:
            self.toc = archive.read(toc_path)
        except KeyError:
            raise InvalidEpubException('TOC file was referenced in OPF, but not found in archive: toc file %s' % toc_path, archive=self)

        toc_tree = util.xml_from_string(self.toc)

        # orderable_author is derived right after authors is assigned.
        self.authors = self._get_authors(opf_tree)
        self.orderable_author = self.safe_author()

        self.title = self._get_title(opf_tree)

        self._get_content(archive, opf_tree, toc_tree, opf_items, base_path)
        self._get_stylesheets(archive, opf_items, base_path)
        self._get_images(archive, opf_items, base_path)
Esempio n. 4
0
 def _get_metadata(self, metadata_tag, opf, plural=False, as_string=False, as_list=False):
     '''Return the text of every `metadata_tag` element in the NS['dc'] namespace.

     The `opf` string is parsed only while self._parsed_metadata is None; the
     parsed tree is stored there and reused by later calls, which ignore
     `opf`.  Returns None if that parse raises InvalidEpubException.

     Result shape:
       * as_list (checked first): list of the stripped text of each element
         whose text is non-empty;
       * as_string: those same stripped texts joined with ', ';
       * otherwise: the unstripped text of each element whose text is not
         None -- the bare string when exactly one matched (a 1-tuple if
         `plural`), else the list itself, which may be empty.
     '''
     if self._parsed_metadata is None:
         try:
             self._parsed_metadata = util.xml_from_string(opf)
         except InvalidEpubException:
             return None

     query = './/{%s}%s' % (NS['dc'], metadata_tag)
     matches = self._parsed_metadata.findall(query)

     if as_list or as_string:
         stripped = [node.text.strip() for node in matches if node.text]
         return stripped if as_list else ', '.join(stripped)

     values = [node.text for node in matches if node.text is not None]
     if len(values) != 1:
         return values
     single = values[0]
     if plural:
         return (single, )
     return single