Esempio n. 1
0
def get_metadata(stream, cover=True):
    with TemporaryDirectory('_pdf_metadata_read') as pdfpath:
        stream.seek(0)
        with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f:
            shutil.copyfileobj(stream, f)
        info = read_info(pdfpath, bool(cover))
        if info is None:
            raise ValueError('Could not read info dict from PDF')
        covpath = os.path.join(pdfpath, 'cover.jpg')
        cdata = None
        if cover and os.path.exists(covpath):
            with open(covpath, 'rb') as f:
                cdata = f.read()

    title = info.get('Title', None) or 'Unknown'
    au = info.get('Author', None)
    if au is None:
        au = ['Unknown']
    else:
        au = string_to_authors(au)
    mi = MetaInformation(title, au)

    creator = info.get('Creator', None)
    if creator:
        mi.book_producer = creator

    keywords = info.get('Keywords', None)
    mi.tags = []
    if keywords:
        mi.tags = [x.strip() for x in keywords.split(',')]
        isbn = [check_isbn(x) for x in mi.tags if check_isbn(x)]
        if isbn:
            mi.isbn = isbn = isbn[0]
        mi.tags = [x for x in mi.tags if check_isbn(x) != isbn]

    subject = info.get('Subject', None)
    if subject:
        mi.tags.insert(0, subject)

    if 'xmp_metadata' in info:
        from ebook_converter.ebooks.metadata.xmp import consolidate_metadata
        mi = consolidate_metadata(mi, info)

    # Look for recognizable identifiers in the info dict, if they were not
    # found in the XMP metadata
    for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
        if scheme not in mi.get_identifiers():
            for k, v in info.items():
                if k != 'xmp_metadata':
                    val = check_func(v)
                    if val:
                        mi.set_identifier(scheme, val)
                        break

    if cdata:
        mi.cover_data = ('jpeg', cdata)
    return mi
Esempio n. 2
0
def _parse_isbn(root, mi, ctx):
    # some people try to put several isbn in this field, but it is not
    # allowed. try to stick to the 1-st one in this case
    isbn = ctx.XPath('normalize-space(//fb:publish-info/fb:isbn/text())')(root)
    if isbn:
        # some people try to put several isbn in this field, but it is not
        # allowed. try to stick to the 1-st one in this case
        if ',' in isbn:
            isbn = isbn[:isbn.index(',')]
        if check_isbn(isbn):
            mi.isbn = isbn
Esempio n. 3
0
 def finalize(scheme, val):
     if not scheme or not val:
         return None, None
     scheme = scheme.lower()
     if scheme in ('http', 'https'):
         return None, None
     if scheme.startswith('isbn'):
         scheme = 'isbn'
     if scheme == 'isbn':
         val = val.split(':')[-1]
         val = check_isbn(val)
         if val is None:
             return None, None
     return scheme, val
Esempio n. 4
0
 def process_metadata(self, idx, content, codec):
     if idx == 100:
         if self.mi.is_null('authors'):
             self.mi.authors = []
         au = clean_xml_chars(self.decode(content).strip())
         # Author names in Amazon  MOBI files are usually in LN, FN format,
         # try to detect and auto-correct that.
         m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
         if m is not None:
             if tweaks['author_sort_copy_method'] != 'copy':
                 self.mi.authors.append(m.group(2) + ' ' + m.group(1))
             else:
                 self.mi.authors.append(m.group())
             if self.mi.is_null('author_sort'):
                 self.mi.author_sort = m.group()
         else:
             self.mi.authors.append(au)
     elif idx == 101:
         self.mi.publisher = clean_xml_chars(self.decode(content).strip())
         if self.mi.publisher in {'Unknown', 'Unknown'}:
             self.mi.publisher = None
     elif idx == 103:
         self.mi.comments = clean_xml_chars(self.decode(content).strip())
     elif idx == 104:
         raw = check_isbn(self.decode(content).strip().replace('-', ''))
         if raw:
             self.mi.isbn = raw
     elif idx == 105:
         if not self.mi.tags:
             self.mi.tags = []
         self.mi.tags.extend([
             x.strip()
             for x in clean_xml_chars(self.decode(content)).split(';')
         ])
         self.mi.tags = uniq(self.mi.tags)
     elif idx == 106:
         try:
             self.mi.pubdate = parse_date(self.decode(content),
                                          as_utc=False)
         except Exception:
             pass
     elif idx == 108:
         self.mi.book_producer = clean_xml_chars(
             self.decode(content).strip())
     elif idx == 109:
         self.mi.rights = clean_xml_chars(self.decode(content).strip())
     elif idx == 112:  # dc:source set in some EBSP amazon samples
         try:
             content = content.decode(codec).strip()
             isig = 'urn:isbn:'
             if content.lower().startswith(isig):
                 raw = check_isbn(content[len(isig):])
                 if raw and not self.mi.isbn:
                     self.mi.isbn = raw
             elif content.startswith('calibre:'):
                 # calibre book uuid is stored here by recent calibre
                 # releases
                 cid = content[len('calibre:'):]
                 if cid:
                     self.mi.application_id = self.mi.uuid = cid
         except:
             pass
     elif idx == 113:  # ASIN or other id
         try:
             self.uuid = content.decode('ascii')
             self.mi.set_identifier('mobi-asin', self.uuid)
         except Exception:
             self.uuid = None
     elif idx == 116:
         self.start_offset, = struct.unpack(b'>L', content)
     elif idx == 121:
         self.kf8_header, = struct.unpack(b'>L', content)
         if self.kf8_header == NULL_INDEX:
             self.kf8_header = None
Esempio n. 5
0
def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata('Unknown')
    title = first_alt('//dc:title', root)
    if title:
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError('Corrupted XMP metadata packet detected, '
                             'probably generated by Nitro PDF')
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = (parse_date(first_sequence('//dc:date', root)
                              or first_simple('//xmp:CreateDate', root),
                              assume_utc=False))
    except Exception:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except Exception:
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                identifiers['doi'] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi
Esempio n. 6
0
def get_metadata(stream, extract_cover=True):
    whitespace = re.compile(r'\s+')

    def normalize(s):
        return whitespace.sub(' ', s).strip()

    with ZipFile(stream) as zf:
        meta = zf.read('meta.xml')
        root = fromstring(meta)

        def find(field):
            ns, tag = fields[field]
            ans = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
            if ans:
                return normalize(
                    tostring(ans[0],
                             method='text',
                             encoding='unicode',
                             with_tail=False)).strip()

        mi = MetaInformation(None, [])
        title = find('title')
        if title:
            mi.title = title
        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)
        desc = find('description')
        if desc:
            mi.comments = desc
        lang = find('language')
        if lang and canonicalize_lang(lang):
            mi.languages = [canonicalize_lang(lang)]
        kw = find('keyword') or find('keywords')
        if kw:
            mi.tags = [x.strip() for x in kw.split(',') if x.strip()]
        data = {}
        for tag in root.xpath('//ns0:user-defined',
                              namespaces={'ns0': fields['user-defined'][0]}):
            name = (tag.get('{%s}name' % METANS) or '').lower()
            vtype = tag.get('{%s}value-type' % METANS) or 'string'
            val = tag.text
            if name and val:
                if vtype == 'boolean':
                    val = val == 'true'
                data[name] = val
        opfmeta = False  # we need this later for the cover
        opfnocover = False
        if data.get('opf.metadata'):
            # custom metadata contains OPF information
            opfmeta = True
            if data.get('opf.titlesort', ''):
                mi.title_sort = data['opf.titlesort']
            if data.get('opf.authors', ''):
                mi.authors = string_to_authors(data['opf.authors'])
            if data.get('opf.authorsort', ''):
                mi.author_sort = data['opf.authorsort']
            if data.get('opf.isbn', ''):
                isbn = check_isbn(data['opf.isbn'])
                if isbn is not None:
                    mi.isbn = isbn
            if data.get('opf.publisher', ''):
                mi.publisher = data['opf.publisher']
            if data.get('opf.pubdate', ''):
                mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
            if data.get('opf.identifiers'):
                try:
                    mi.identifiers = json.loads(data['opf.identifiers'])
                except Exception:
                    pass
            if data.get('opf.rating'):
                try:
                    mi.rating = max(0, min(float(data['opf.rating']), 10))
                except Exception:
                    pass
            if data.get('opf.series', ''):
                mi.series = data['opf.series']
                if data.get('opf.seriesindex', ''):
                    try:
                        mi.series_index = float(data['opf.seriesindex'])
                    except Exception:
                        mi.series_index = 1.0
            if data.get('opf.language', ''):
                cl = canonicalize_lang(data['opf.language'])
                if cl:
                    mi.languages = [cl]
            opfnocover = data.get('opf.nocover', False)
        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data

    return mi