def test_input_meta_multi(self):
    stream_meta = get_metadata(self.get_stream('meta_multi'))
    canon_meta = Metadata('A Meta Tag & Title Ⓒ',
                          ['George Washington', 'John Adams', 'Thomas Jefferson'])
    canon_meta.publisher = 'Publisher A'
    canon_meta.languages = ['English', 'Spanish']
    canon_meta.pubdate = parse_date('2019-01-01')
    canon_meta.timestamp = parse_date('2018-01-01')
    canon_meta.series = 'Meta Series'
    canon_meta.series_index = float(1)
    canon_meta.rating = float(8)
    canon_meta.comments = 'meta "comments" ♥ HTML &'
    canon_meta.tags = ['tag a', 'tag b', 'tag c']
    canon_meta.set_identifiers({'isbn': '1234567890',
                                'url': 'http://google.com/search?q=calibre'})
    self.compare_metadata(stream_meta, canon_meta)

def test_input_comment_multi(self):
    stream_meta = get_metadata(self.get_stream('comment_multi'))
    canon_meta = Metadata('A Comment Tag & Title Ⓒ',
                          ['James Madison', 'James Monroe', 'John Quincy Adams'])
    canon_meta.publisher = 'Publisher C'
    canon_meta.languages = ['French', 'Japanese']
    canon_meta.pubdate = parse_date('2015-01-01')
    canon_meta.timestamp = parse_date('2014-01-01')
    canon_meta.series = 'Comment Series'
    canon_meta.series_index = float(3)
    canon_meta.rating = float(0)
    canon_meta.comments = 'comment "comments" ♥ HTML -- too &'
    canon_meta.tags = ['tag d', 'tag e', 'tag f']
    canon_meta.set_identifiers({'isbn': '3456789012',
                                'url': 'http://google.com/search?q=calibre'})
    self.compare_metadata(stream_meta, canon_meta)

def test_input_meta_single(self):
    stream_meta = get_metadata(self.get_stream('meta_single'))
    canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington'])
    canon_meta.publisher = 'Publisher A'
    canon_meta.languages = ['English']
    canon_meta.pubdate = parse_date('2019-01-01')
    canon_meta.timestamp = parse_date('2018-01-01')
    canon_meta.series = 'Meta Series'
    canon_meta.series_index = float(1)
    # canon_meta.rating = float(0)
    # canon_meta.comments = ''
    canon_meta.tags = ['tag a', 'tag b']
    canon_meta.set_identifiers({'isbn': '1234567890'})
    self.compare_metadata(stream_meta, canon_meta)

def consolidate_metadata(info_mi, info):
    '''
    When both the PDF Info dict and XMP metadata are present, prefer the XMP
    metadata unless the Info ModDate is newer than the XMP MetadataDate. This
    is the algorithm recommended by the PDF spec.
    '''
    try:
        raw = info['xmp_metadata'].rstrip()
        if not raw:
            return info_mi
        xmp_mi = metadata_from_xmp_packet(raw)
    except Exception:
        import traceback
        traceback.print_exc()
        return info_mi
    info_title = info_mi.title or 'Unknown'
    info_authors = list(info_mi.authors or ())
    info_tags = list(info_mi.tags or ())
    info_mi.smart_update(xmp_mi, replace_metadata=True)
    prefer_info = False
    if 'ModDate' in info and hasattr(xmp_mi, 'metadata_date'):
        try:
            info_date = parse_date(info['ModDate'])
        except Exception:
            pass
        else:
            prefer_info = info_date > xmp_mi.metadata_date
    if prefer_info:
        info_mi.title = info_title
        info_mi.authors = info_authors
        info_mi.tags = info_tags
    else:
        # We'll use the xmp tags/authors but fall back to the info ones if the
        # xmp does not have tags/authors. smart_update() should have taken
        # care of the rest.
        info_mi.authors = info_authors if xmp_mi.is_null('authors') else xmp_mi.authors
        info_mi.tags = xmp_mi.tags or info_tags
    return info_mi

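# Illustrative usage only, not part of the original module. ``info`` mirrors
# the parsed PDF Info dict; the key names follow the code above, but the
# values are made up:
#
#   info_mi = Metadata('Title from Info', ['Author From Info'])
#   info = {'ModDate': '2020-01-02T03:04:05+00:00',
#           'xmp_metadata': raw_xmp_bytes}  # raw_xmp_bytes: the raw XMP packet
#   mi = consolidate_metadata(info_mi, info)
#   # mi carries the XMP fields unless ModDate post-dates xmp:MetadataDate
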
def get_metadata_(src, encoding=None):
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9
    if isinstance(src, bytes):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        if not ans:
            ans = None
        return ans

    def get(field):
        ans = get_all(field)
        if ans:
            ans = ans[0]
        return ans

    # Title
    title = get('title') or title_tag.strip() or 'Unknown'

    # Author
    authors = authors_to_string(get_all('authors')) or 'Unknown'

    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))

    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    # Multi-value text fields
    for field in ('languages',):
        val = get_all(field)
        if val:
            setattr(mi, field, val)

    # HTML fields
    for field in ('comments',):
        val = get(field)
        if val:
            setattr(mi, field, val.replace('&', '&amp;').replace('<', '&lt;')
                    .replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;'))

    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except Exception:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except Exception:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except Exception:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 10:
                mi.rating = 0
        except Exception:
            pass

    # TAGS
    tags = get_all('tags')
    if tags:
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    # IDENTIFIERS
    for (k, v) in meta_tag_ids.items():
        v = [x.strip() for x in v if x.strip()]
        if v:
            mi.set_identifier(k, v[0])

    return mi

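# Illustrative only: the series handling above strips a trailing "[index]"
# from the series string, so a hypothetical document containing
#
#   <meta name="series" content="Meta Series [3.5]">
#
# yields mi.series == 'Meta Series' and mi.series_index == 3.5.
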
def process_metadata(self, idx, content, codec):
    if idx == 100:
        if self.mi.is_null('authors'):
            self.mi.authors = []
        au = clean_xml_chars(self.decode(content).strip())
        # Author names in Amazon MOBI files are usually in LN, FN format,
        # try to detect and auto-correct that.
        m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
        if m is not None:
            if tweaks['author_sort_copy_method'] != 'copy':
                self.mi.authors.append(m.group(2) + ' ' + m.group(1))
            else:
                self.mi.authors.append(m.group())
            if self.mi.is_null('author_sort'):
                self.mi.author_sort = m.group()
        else:
            self.mi.authors.append(au)
    elif idx == 101:
        self.mi.publisher = clean_xml_chars(self.decode(content).strip())
        if self.mi.publisher in {'Unknown', _('Unknown')}:
            self.mi.publisher = None
    elif idx == 103:
        self.mi.comments = clean_xml_chars(self.decode(content).strip())
    elif idx == 104:
        raw = check_isbn(self.decode(content).strip().replace('-', ''))
        if raw:
            self.mi.isbn = raw
    elif idx == 105:
        if not self.mi.tags:
            self.mi.tags = []
        self.mi.tags.extend([
            x.strip() for x in clean_xml_chars(self.decode(content)).split(';')
        ])
        self.mi.tags = uniq(self.mi.tags)
    elif idx == 106:
        try:
            self.mi.pubdate = parse_date(self.decode(content), as_utc=False)
        except Exception:
            pass
    elif idx == 108:
        self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
    elif idx == 109:
        self.mi.rights = clean_xml_chars(self.decode(content).strip())
    elif idx == 112:  # dc:source set in some EBSP amazon samples
        try:
            content = content.decode(codec).strip()
            isig = 'urn:isbn:'
            if content.lower().startswith(isig):
                raw = check_isbn(content[len(isig):])
                if raw and not self.mi.isbn:
                    self.mi.isbn = raw
            elif content.startswith('calibre:'):
                # calibre book uuid is stored here by recent calibre
                # releases
                cid = content[len('calibre:'):]
                if cid:
                    self.mi.application_id = self.mi.uuid = cid
        except Exception:
            pass
    elif idx == 113:  # ASIN or other id
        try:
            self.uuid = content.decode('ascii')
            self.mi.set_identifier('mobi-asin', self.uuid)
        except Exception:
            self.uuid = None
    elif idx == 116:
        self.start_offset, = struct.unpack(b'>L', content)
    elif idx == 121:
        self.kf8_header, = struct.unpack(b'>L', content)
        if self.kf8_header == NULL_INDEX:
            self.kf8_header = None

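# Illustrative only: the LN, FN auto-correction for EXTH record 100 above
# turns 'Washington, George' into 'George Washington' (when the
# author_sort_copy_method tweak is not 'copy'), keeping the original form as
# the author sort. A quick sanity check of the regex:
#
#   m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', 'Washington, George')
#   assert m.group(2) + ' ' + m.group(1) == 'George Washington'
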
def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata('Unknown')
    title = first_alt('//dc:title', root)
    if title:
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError('Corrupted XMP metadata packet detected, '
                             'probably generated by Nitro PDF')
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root) or
                             first_simple('//xmp:CreateDate', root), assume_utc=False)
    except Exception:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
            break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except Exception:
                pass
    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages
    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value
    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val
    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                identifiers[scheme] = val
    if identifiers:
        mi.set_identifiers(identifiers)
    read_user_metadata(mi, root)
    return mi

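# Illustrative only: a minimal, hypothetical XMP packet exercising the dc:
# fields read above (not a fixture from this repo):
#
#   raw = b'''<x:xmpmeta xmlns:x="adobe:ns:meta/">
#     <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
#              xmlns:dc="http://purl.org/dc/elements/1.1/">
#       <rdf:Description rdf:about="">
#         <dc:title><rdf:Alt><rdf:li xml:lang="x-default">A Title</rdf:li></rdf:Alt></dc:title>
#         <dc:creator><rdf:Seq><rdf:li>An Author</rdf:li></rdf:Seq></dc:creator>
#       </rdf:Description>
#     </rdf:RDF>
#   </x:xmpmeta>'''
#   mi = metadata_from_xmp_packet(raw)  # mi.title == 'A Title'
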
def safe_parse_date(raw):
    if raw:
        try:
            return parse_date(raw)
        except Exception:
            pass

def get_metadata(stream, extract_cover=True):
    whitespace = re.compile(r'\s+')

    def normalize(s):
        return whitespace.sub(' ', s).strip()

    with ZipFile(stream) as zf:
        meta = zf.read('meta.xml')
        root = fromstring(meta)

        def find(field):
            ns, tag = fields[field]
            ans = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
            if ans:
                return normalize(tostring(ans[0], method='text', encoding='unicode',
                                          with_tail=False)).strip()

        mi = MetaInformation(None, [])
        title = find('title')
        if title:
            mi.title = title
        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)
        desc = find('description')
        if desc:
            mi.comments = desc
        lang = find('language')
        if lang and canonicalize_lang(lang):
            mi.languages = [canonicalize_lang(lang)]
        kw = find('keyword') or find('keywords')
        if kw:
            mi.tags = [x.strip() for x in kw.split(',') if x.strip()]
        data = {}
        for tag in root.xpath('//ns0:user-defined',
                              namespaces={'ns0': fields['user-defined'][0]}):
            name = (tag.get('{%s}name' % METANS) or '').lower()
            vtype = tag.get('{%s}value-type' % METANS) or 'string'
            val = tag.text
            if name and val:
                if vtype == 'boolean':
                    val = val == 'true'
                data[name] = val
        opfmeta = False  # we need this later for the cover
        opfnocover = False
        if data.get('opf.metadata'):
            # custom metadata contains OPF information
            opfmeta = True
            if data.get('opf.titlesort', ''):
                mi.title_sort = data['opf.titlesort']
            if data.get('opf.authors', ''):
                mi.authors = string_to_authors(data['opf.authors'])
            if data.get('opf.authorsort', ''):
                mi.author_sort = data['opf.authorsort']
            if data.get('opf.isbn', ''):
                isbn = check_isbn(data['opf.isbn'])
                if isbn is not None:
                    mi.isbn = isbn
            if data.get('opf.publisher', ''):
                mi.publisher = data['opf.publisher']
            if data.get('opf.pubdate', ''):
                mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
            if data.get('opf.identifiers'):
                try:
                    mi.identifiers = json.loads(data['opf.identifiers'])
                except Exception:
                    pass
            if data.get('opf.rating'):
                try:
                    mi.rating = max(0, min(float(data['opf.rating']), 10))
                except Exception:
                    pass
            if data.get('opf.series', ''):
                mi.series = data['opf.series']
                if data.get('opf.seriesindex', ''):
                    try:
                        mi.series_index = float(data['opf.seriesindex'])
                    except Exception:
                        mi.series_index = 1.0
            if data.get('opf.language', ''):
                cl = canonicalize_lang(data['opf.language'])
                if cl:
                    mi.languages = [cl]
            opfnocover = data.get('opf.nocover', False)
        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data

    return mi
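
# Illustrative usage only (the file name is hypothetical):
#
#   with open('book.odt', 'rb') as f:
#       mi = get_metadata(f, extract_cover=False)
#   print(mi.title, mi.authors)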