Ejemplo n.º 1
0
 def test_input_meta_multi(self):
     """Multiple authors/languages/tags supplied via <meta> tags must all
     round-trip through get_metadata() intact."""
     stream_meta = get_metadata(self.get_stream('meta_multi'))
     canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington', 'John Adams', 'Thomas Jefferson'])
     # Expected field values for the canonical Metadata object.
     expected = {
         'publisher': 'Publisher A',
         'languages': ['English', 'Spanish'],
         'pubdate': parse_date('2019-01-01'),
         'timestamp': parse_date('2018-01-01'),
         'series': 'Meta Series',
         'series_index': 1.0,
         'rating': 8.0,
         'comments': 'meta "comments" ♥ HTML &',
         'tags': ['tag a', 'tag b', 'tag c'],
     }
     for attr, value in expected.items():
         setattr(canon_meta, attr, value)
     canon_meta.set_identifiers({'isbn': '1234567890', 'url': 'http://google.com/search?q=calibre'})
     self.compare_metadata(stream_meta, canon_meta)
Ejemplo n.º 2
0
 def test_input_comment_multi(self):
     """Multiple authors/languages/tags supplied via comment tags must all
     round-trip through get_metadata() intact."""
     stream_meta = get_metadata(self.get_stream('comment_multi'))
     canon_meta = Metadata('A Comment Tag & Title Ⓒ', ['James Madison', 'James Monroe', 'John Quincy Adams'])
     # Expected field values for the canonical Metadata object.
     expected = {
         'publisher': 'Publisher C',
         'languages': ['French', 'Japanese'],
         'pubdate': parse_date('2015-01-01'),
         'timestamp': parse_date('2014-01-01'),
         'series': 'Comment Series',
         'series_index': 3.0,
         'rating': 0.0,
         'comments': 'comment "comments" ♥ HTML -- too &',
         'tags': ['tag d', 'tag e', 'tag f'],
     }
     for attr, value in expected.items():
         setattr(canon_meta, attr, value)
     canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
     self.compare_metadata(stream_meta, canon_meta)
Ejemplo n.º 3
0
 def test_input_meta_single(self):
     """Single-valued <meta> tag metadata must round-trip through
     get_metadata() intact; rating and comments are deliberately absent
     from this sample."""
     stream_meta = get_metadata(self.get_stream('meta_single'))
     canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington'])
     # Expected field values for the canonical Metadata object.
     expected = {
         'publisher': 'Publisher A',
         'languages': ['English'],
         'pubdate': parse_date('2019-01-01'),
         'timestamp': parse_date('2018-01-01'),
         'series': 'Meta Series',
         'series_index': 1.0,
         'tags': ['tag a', 'tag b'],
     }
     for attr, value in expected.items():
         setattr(canon_meta, attr, value)
     canon_meta.set_identifiers({'isbn': '1234567890'})
     self.compare_metadata(stream_meta, canon_meta)
Ejemplo n.º 4
0
def consolidate_metadata(info_mi, info):
    ''' When both the PDF Info dict and XMP metadata are present, prefer the xmp
    metadata unless the Info ModDate is newer than the XMP MetadataDate. This
    is the algorithm recommended by the PDF spec.

    :param info_mi: Metadata object built from the PDF Info dictionary
    :param info: the raw PDF Info dict; may contain 'xmp_metadata' and 'ModDate'
    :return: info_mi, updated in place with the consolidated values
    '''
    try:
        raw = info['xmp_metadata'].rstrip()
        if not raw:
            # No XMP packet present: the Info-dict metadata is all we have.
            return info_mi
        xmp_mi = metadata_from_xmp_packet(raw)
    except Exception:
        # Any failure reading/parsing the XMP packet falls back to Info metadata.
        import traceback
        traceback.print_exc()
        return info_mi
    # Remember the Info-dict values before smart_update() overwrites them,
    # so they can be restored (or used as fallbacks) below.
    info_title = info_mi.title or 'Unknown'
    info_authors = list(info_mi.authors or ())
    info_tags = list(info_mi.tags or ())
    info_mi.smart_update(xmp_mi, replace_metadata=True)
    prefer_info = False
    if 'ModDate' in info and hasattr(xmp_mi, 'metadata_date'):
        try:
            info_date = parse_date(info['ModDate'])
        except Exception:
            # Unparseable ModDate: keep preferring the XMP metadata.
            pass
        else:
            prefer_info = info_date > xmp_mi.metadata_date
    if prefer_info:
        # Info dict was modified more recently than the XMP packet.
        info_mi.title = info_title
        info_mi.authors = info_authors
        info_mi.tags = info_tags
    else:
        # We'll use the xmp tags/authors but fallback to the info ones if the
        # xmp does not have tags/authors. smart_update() should have taken care
        # of the rest
        info_mi.authors = (info_authors
                           if xmp_mi.is_null('authors') else xmp_mi.authors)
        info_mi.tags = xmp_mi.tags or info_tags
    return info_mi
Ejemplo n.º 5
0
def get_metadata_(src, encoding=None):
    """Extract book metadata from comment/meta tags in an HTML document.

    Meta data definitions as in
    https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    :param src: the document source, as bytes or unicode
    :param encoding: encoding used to decode bytes input; when None the
        encoding is auto-detected via xml_to_unicode()
    :return: a Metadata object populated from the discovered tags
    """
    if isinstance(src, bytes):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        # Comment tags take precedence over meta tags for the same field.
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        if not ans:
            ans = None
        return ans

    def get(field):
        # First value for the field, or None.
        ans = get_all(field)
        if ans:
            ans = ans[0]
        return ans

    # Title
    title = get('title') or title_tag.strip() or 'Unknown'

    # Author
    authors = authors_to_string(get_all('authors')) or 'Unknown'

    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))

    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    # Multi-value text fields
    for field in ('languages',):
        val = get_all(field)
        if val:
            setattr(mi, field, val)

    # HTML fields: escape raw text so it can be stored as HTML.
    # '&' must be replaced first so the other entities are not double-escaped.
    for field in ('comments',):
        val = get(field)
        if val:
            setattr(mi, field, val.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&apos;'))

    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except Exception:
            # Field absent or unparseable; leave the date undefined.
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES: a trailing "[n]" on the series name encodes the series index.
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except ValueError:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            # Fall back to an explicit series_index field; discard the raw
            # string if it is missing or not a valid number (previously the
            # unparsed string could be assigned to mi.series_index).
            raw_index = get('series_index')
            try:
                series_index = float(raw_index)
            except (TypeError, ValueError):
                series_index = None
        if series_index is not None:
            mi.series_index = series_index

    # RATING: values outside [0, 10] are reset to 0.
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 10:
                mi.rating = 0
        except ValueError:
            pass

    # TAGS: each tag field may itself hold a comma-separated list.
    tags = get_all('tags')
    if tags:
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    # IDENTIFIERS: first non-empty value wins for each scheme.
    for (k, v) in meta_tag_ids.items():
        v = [x.strip() for x in v if x.strip()]
        if v:
            mi.set_identifier(k, v[0])

    return mi
Ejemplo n.º 6
0
 def process_metadata(self, idx, content, codec):
     """Decode one MOBI EXTH metadata record and apply it to self.mi.

     :param idx: the EXTH record type (100=author, 101=publisher, ...)
     :param content: the raw record payload (bytes)
     :param codec: the book's text encoding, used when decoding directly
     """
     if idx == 100:
         if self.mi.is_null('authors'):
             self.mi.authors = []
         au = clean_xml_chars(self.decode(content).strip())
         # Author names in Amazon MOBI files are usually in LN, FN format,
         # try to detect and auto-correct that.
         m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
         if m is not None:
             if tweaks['author_sort_copy_method'] != 'copy':
                 self.mi.authors.append(m.group(2) + ' ' + m.group(1))
             else:
                 self.mi.authors.append(m.group())
             if self.mi.is_null('author_sort'):
                 self.mi.author_sort = m.group()
         else:
             self.mi.authors.append(au)
     elif idx == 101:
         self.mi.publisher = clean_xml_chars(self.decode(content).strip())
         # Discard the placeholder publisher (the original compared against
         # a set containing 'Unknown' twice, i.e. just {'Unknown'}).
         if self.mi.publisher == 'Unknown':
             self.mi.publisher = None
     elif idx == 103:
         self.mi.comments = clean_xml_chars(self.decode(content).strip())
     elif idx == 104:
         raw = check_isbn(self.decode(content).strip().replace('-', ''))
         if raw:
             self.mi.isbn = raw
     elif idx == 105:
         if not self.mi.tags:
             self.mi.tags = []
         self.mi.tags.extend([
             x.strip()
             for x in clean_xml_chars(self.decode(content)).split(';')
         ])
         self.mi.tags = uniq(self.mi.tags)
     elif idx == 106:
         try:
             self.mi.pubdate = parse_date(self.decode(content),
                                          as_utc=False)
         except Exception:
             pass
     elif idx == 108:
         self.mi.book_producer = clean_xml_chars(
             self.decode(content).strip())
     elif idx == 109:
         self.mi.rights = clean_xml_chars(self.decode(content).strip())
     elif idx == 112:  # dc:source set in some EBSP amazon samples
         try:
             content = content.decode(codec).strip()
             isig = 'urn:isbn:'
             if content.lower().startswith(isig):
                 # Only use the source ISBN if none was found earlier.
                 raw = check_isbn(content[len(isig):])
                 if raw and not self.mi.isbn:
                     self.mi.isbn = raw
             elif content.startswith('calibre:'):
                 # calibre book uuid is stored here by recent calibre
                 # releases
                 cid = content[len('calibre:'):]
                 if cid:
                     self.mi.application_id = self.mi.uuid = cid
         except Exception:
             # Best-effort: a malformed record must not abort parsing.
             pass
     elif idx == 113:  # ASIN or other id
         try:
             self.uuid = content.decode('ascii')
             self.mi.set_identifier('mobi-asin', self.uuid)
         except Exception:
             self.uuid = None
     elif idx == 116:
         self.start_offset, = struct.unpack(b'>L', content)
     elif idx == 121:
         self.kf8_header, = struct.unpack(b'>L', content)
         if self.kf8_header == NULL_INDEX:
             self.kf8_header = None
Ejemplo n.º 7
0
def metadata_from_xmp_packet(raw_bytes):
    """Build a Metadata object from a raw XMP metadata packet.

    :param raw_bytes: the XMP packet as bytes
    :return: a populated Metadata object
    :raises ValueError: if the packet looks like a corrupted Nitro PDF packet
    """
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata('Unknown')
    title = first_alt('//dc:title', root)
    if title:
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError('Corrupted XMP metadata packet detected, '
                             'probably generated by Nitro PDF')
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    # dc:subject is preferred; fall back to pdf:Keywords.
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = (parse_date(first_sequence('//dc:date', root)
                              or first_simple('//xmp:CreateDate', root),
                              assume_utc=False))
    except Exception:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    # Use the more recent of MetadataDate/ModifyDate as the metadata date.
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except Exception:
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    # Look for identifiers in the prism/pdfx namespaces as well.
    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # BUG FIX: previously this always stored under 'doi', so a
                # recognized ISBN was recorded with the wrong scheme.
                identifiers[scheme] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi
Ejemplo n.º 8
0
def safe_parse_date(raw):
    """Parse *raw* as a date, returning None for empty/falsy input or on
    any parse failure instead of raising."""
    if not raw:
        return None
    try:
        return parse_date(raw)
    except Exception:
        return None
Ejemplo n.º 9
0
def get_metadata(stream, extract_cover=True):
    """Read metadata from an ODF document's meta.xml.

    :param stream: a file-like object containing an ODF (zip) document
    :param extract_cover: when True, also attempt to extract the cover image
    :return: a MetaInformation object
    """
    whitespace = re.compile(r'\s+')

    def normalize(s):
        # Collapse internal whitespace runs to single spaces.
        return whitespace.sub(' ', s).strip()

    with ZipFile(stream) as zf:
        meta = zf.read('meta.xml')
        root = fromstring(meta)

        def find(field):
            # Return the normalized text of the first matching element, or None.
            ns, tag = fields[field]
            ans = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
            if ans:
                return normalize(
                    tostring(ans[0],
                             method='text',
                             encoding='unicode',
                             with_tail=False)).strip()

        mi = MetaInformation(None, [])
        title = find('title')
        if title:
            mi.title = title
        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)
        desc = find('description')
        if desc:
            mi.comments = desc
        lang = find('language')
        if lang:
            # Only call canonicalize_lang once (was called twice before).
            cl = canonicalize_lang(lang)
            if cl:
                mi.languages = [cl]
        kw = find('keyword') or find('keywords')
        if kw:
            mi.tags = [x.strip() for x in kw.split(',') if x.strip()]
        # Collect user-defined metadata fields (used for OPF pass-through).
        data = {}
        for tag in root.xpath('//ns0:user-defined',
                              namespaces={'ns0': fields['user-defined'][0]}):
            name = (tag.get('{%s}name' % METANS) or '').lower()
            vtype = tag.get('{%s}value-type' % METANS) or 'string'
            val = tag.text
            if name and val:
                if vtype == 'boolean':
                    val = val == 'true'
                data[name] = val
        opfmeta = False  # we need this later for the cover
        opfnocover = False
        if data.get('opf.metadata'):
            # custom metadata contains OPF information
            opfmeta = True
            if data.get('opf.titlesort', ''):
                mi.title_sort = data['opf.titlesort']
            if data.get('opf.authors', ''):
                mi.authors = string_to_authors(data['opf.authors'])
            if data.get('opf.authorsort', ''):
                mi.author_sort = data['opf.authorsort']
            if data.get('opf.isbn', ''):
                isbn = check_isbn(data['opf.isbn'])
                if isbn is not None:
                    mi.isbn = isbn
            if data.get('opf.publisher', ''):
                mi.publisher = data['opf.publisher']
            if data.get('opf.pubdate', ''):
                mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
            if data.get('opf.identifiers'):
                try:
                    mi.identifiers = json.loads(data['opf.identifiers'])
                except Exception:
                    pass
            if data.get('opf.rating'):
                try:
                    # Clamp the rating into [0, 10].
                    mi.rating = max(0, min(float(data['opf.rating']), 10))
                except Exception:
                    pass
            if data.get('opf.series', ''):
                mi.series = data['opf.series']
                if data.get('opf.seriesindex', ''):
                    try:
                        mi.series_index = float(data['opf.seriesindex'])
                    except Exception:
                        # Invalid index: fall back to the default of 1.0.
                        mi.series_index = 1.0
            if data.get('opf.language', ''):
                cl = canonicalize_lang(data['opf.language'])
                if cl:
                    mi.languages = [cl]
            opfnocover = data.get('opf.nocover', False)
        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data

    return mi