class EXTHHeader(object):  # {{{
    """Parse a MOBI EXTH metadata block into a MetaInformation object.

    :param raw:   the raw EXTH block; first 4 bytes are kept as ``doctype``,
                  bytes 4-12 are the block length and record count
    :param codec: text codec declared by the enclosing MOBI header, used to
                  decode string-valued records
    :param title: fallback title (presumably from the PDB header — the long
                  title record 503, when present, overrides it)

    Parsed results are exposed as ``self.mi`` plus assorted attributes
    (``cover_offset``, ``thumbnail_offset``, ``kf8_header``, ``uuid``,
    ``cdetype``, ``start_offset``, ``has_fake_cover``).
    """

    def __init__(self, raw, codec, title):
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        self.start_offset = None
        left = self.num_items
        self.kf8_header = None
        self.uuid = self.cdetype = None

        # Each EXTH record is: 4-byte id, 4-byte total size, then payload.
        while left > 0:
            left -= 1
            idx, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if 100 <= idx < 200:
                # Standard metadata records are handled separately
                self.process_metadata(idx, content, codec)
            elif idx == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif idx == 201:
                co, = struct.unpack('>L', content)
                if co < NULL_INDEX:
                    self.cover_offset = co
            elif idx == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif idx == 501:
                # cdetype
                try:
                    self.cdetype = content.decode('ascii')
                except UnicodeDecodeError:
                    self.cdetype = None
                if content == b'EBSP':
                    if not self.mi.tags:
                        self.mi.tags = []
                    self.mi.tags.append(_('Sample Book'))
            elif idx == 502:
                # last update time
                pass
            elif idx == 503:  # Long title
                # Amazon seems to regard this as the definitive book title
                # rather than the title from the PDB header. In fact when
                # sending MOBI files through Amazon's email service if the
                # title contains non ASCII chars or non filename safe chars
                # they are messed up in the PDB header
                try:
                    title = content.decode(codec)
                except Exception:
                    # best-effort: keep the PDB-header title on decode failure
                    pass
            elif idx == 524:  # Lang code
                try:
                    lang = content.decode(codec)
                    lang = canonicalize_lang(lang)
                    if lang:
                        self.mi.language = lang
                except Exception:
                    pass
            # else:
            #     print 'unknown record', idx, repr(content)
        if title:
            self.mi.title = replace_entities(title)

    def process_metadata(self, idx, content, codec):
        """Handle one standard (100-199) EXTH metadata record."""
        if idx == 100:
            if self.mi.is_null('authors'):
                self.mi.authors = []
            au = content.decode(codec, 'ignore').strip()
            self.mi.authors.append(au)
            # A "Last, First" looking author also serves as the author sort
            if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
                self.mi.author_sort = au.strip()
        elif idx == 101:
            self.mi.publisher = content.decode(codec, 'ignore').strip()
            if self.mi.publisher in {'Unknown', _('Unknown')}:
                self.mi.publisher = None
        elif idx == 103:
            self.mi.comments = content.decode(codec, 'ignore')
        elif idx == 104:
            raw = check_isbn(content.decode(codec, 'ignore').strip().replace('-', ''))
            if raw:
                self.mi.isbn = raw
        elif idx == 105:
            if not self.mi.tags:
                self.mi.tags = []
            self.mi.tags.extend([x.strip() for x in content.decode(codec,
                'ignore').split(';')])
            self.mi.tags = list(set(self.mi.tags))
        elif idx == 106:
            try:
                self.mi.pubdate = parse_date(content, as_utc=False)
            except Exception:
                pass
        elif idx == 108:
            self.mi.book_producer = content.decode(codec, 'ignore').strip()
        elif idx == 112:  # dc:source set in some EBSP amazon samples
            try:
                content = content.decode(codec).strip()
                isig = 'urn:isbn:'
                if content.lower().startswith(isig):
                    raw = check_isbn(content[len(isig):])
                    if raw and not self.mi.isbn:
                        self.mi.isbn = raw
            except Exception:
                pass
        elif idx == 113:  # ASIN or other id
            try:
                self.uuid = content.decode('ascii')
                self.mi.set_identifier('mobi-asin', self.uuid)
            except Exception:
                self.uuid = None
        elif idx == 116:
            self.start_offset, = struct.unpack(b'>L', content)
        elif idx == 121:
            self.kf8_header, = struct.unpack(b'>L', content)
            if self.kf8_header == NULL_INDEX:
                self.kf8_header = None
class EXTHHeader(object):  # {{{
    """Parse a MOBI EXTH metadata block into a MetaInformation object.

    :param raw:   the raw EXTH block; first 4 bytes are kept as ``doctype``,
                  bytes 4-12 are the block length and record count
    :param codec: text codec declared by the enclosing MOBI header, used to
                  decode string-valued records
    :param title: fallback title (presumably from the PDB header — the long
                  title record 503, when present, overrides it)

    Parsed results are exposed as ``self.mi`` plus assorted attributes
    (``cover_offset``, ``thumbnail_offset``, ``kf8_header``, ``uuid``,
    ``cdetype``, ``start_offset``, ``has_fake_cover``,
    ``page_progression_direction``, ``primary_writing_mode``).
    """

    def __init__(self, raw, codec, title):
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        self.start_offset = None
        left = self.num_items
        self.kf8_header = None
        self.uuid = self.cdetype = None
        self.page_progression_direction = None
        self.primary_writing_mode = None
        # Lenient decoder: replace undecodable bytes, strip control chars
        self.decode = lambda x: clean_ascii_chars(x.decode(codec, 'replace'))

        # Each EXTH record is: 4-byte id, 4-byte total size, then payload.
        while left > 0:
            left -= 1
            idx, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if 100 <= idx < 200:
                # Standard metadata records are handled separately
                self.process_metadata(idx, content, codec)
            elif idx == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif idx == 201:
                co, = struct.unpack('>L', content)
                if co < NULL_INDEX:
                    self.cover_offset = co
            elif idx == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif idx == 501:
                # cdetype
                try:
                    self.cdetype = content.decode('ascii')
                except UnicodeDecodeError:
                    self.cdetype = None
                if content == b'EBSP':
                    if not self.mi.tags:
                        self.mi.tags = []
                    self.mi.tags.append(_('Sample Book'))
            elif idx == 502:
                # last update time
                pass
            elif idx == 503:  # Long title
                # Amazon seems to regard this as the definitive book title
                # rather than the title from the PDB header. In fact when
                # sending MOBI files through Amazon's email service if the
                # title contains non ASCII chars or non filename safe chars
                # they are messed up in the PDB header
                try:
                    title = self.decode(content)
                except Exception:
                    # best-effort: keep the PDB-header title on decode failure
                    pass
            elif idx == 524:  # Lang code
                try:
                    lang = content.decode(codec)
                    lang = canonicalize_lang(lang)
                    if lang:
                        self.mi.language = lang
                except Exception:
                    pass
            elif idx == 525:
                try:
                    pwm = content.decode(codec)
                    if pwm:
                        self.primary_writing_mode = pwm
                except Exception:
                    pass
            elif idx == 527:
                try:
                    ppd = content.decode(codec)
                    if ppd:
                        self.page_progression_direction = ppd
                except Exception:
                    pass
            # else:
            #     print 'unknown record', idx, repr(content)
        if title:
            self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))

    def process_metadata(self, idx, content, codec):
        """Handle one standard (100-199) EXTH metadata record."""
        if idx == 100:
            if self.mi.is_null('authors'):
                self.mi.authors = []
            au = clean_xml_chars(self.decode(content).strip())
            # Author names in Amazon MOBI files are usually in LN, FN format,
            # try to detect and auto-correct that.
            m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
            if m is not None:
                if tweaks['author_sort_copy_method'] != 'copy':
                    self.mi.authors.append(m.group(2) + ' ' + m.group(1))
                else:
                    self.mi.authors.append(m.group())
                if self.mi.is_null('author_sort'):
                    self.mi.author_sort = m.group()
            else:
                self.mi.authors.append(au)
        elif idx == 101:
            self.mi.publisher = clean_xml_chars(self.decode(content).strip())
            if self.mi.publisher in {'Unknown', _('Unknown')}:
                self.mi.publisher = None
        elif idx == 103:
            self.mi.comments = clean_xml_chars(self.decode(content).strip())
        elif idx == 104:
            raw = check_isbn(self.decode(content).strip().replace('-', ''))
            if raw:
                self.mi.isbn = raw
        elif idx == 105:
            if not self.mi.tags:
                self.mi.tags = []
            self.mi.tags.extend([x.strip() for x in
                clean_xml_chars(self.decode(content)).split(';')])
            self.mi.tags = list(set(self.mi.tags))
        elif idx == 106:
            try:
                self.mi.pubdate = parse_date(content, as_utc=False)
            except Exception:
                pass
        elif idx == 108:
            self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
        elif idx == 109:
            self.mi.rights = clean_xml_chars(self.decode(content).strip())
        elif idx == 112:  # dc:source set in some EBSP amazon samples
            try:
                content = content.decode(codec).strip()
                isig = 'urn:isbn:'
                if content.lower().startswith(isig):
                    raw = check_isbn(content[len(isig):])
                    if raw and not self.mi.isbn:
                        self.mi.isbn = raw
                elif content.startswith('calibre:'):
                    # calibre book uuid is stored here by recent calibre
                    # releases
                    cid = content[len('calibre:'):]
                    if cid:
                        self.mi.application_id = self.mi.uuid = cid
            except Exception:
                pass
        elif idx == 113:  # ASIN or other id
            try:
                self.uuid = content.decode('ascii')
                self.mi.set_identifier('mobi-asin', self.uuid)
            except Exception:
                self.uuid = None
        elif idx == 116:
            self.start_offset, = struct.unpack(b'>L', content)
        elif idx == 121:
            self.kf8_header, = struct.unpack(b'>L', content)
            if self.kf8_header == NULL_INDEX:
                self.kf8_header = None
def get_metadata(stream, cover=True):
    """Read metadata (and optionally the cover) from a PDF ``stream``.

    The stream is copied to a temporary file and a worker process runs
    pdfinfo on it; the resulting info dict is turned into a
    ``MetaInformation``.

    :param stream: seekable binary file-like object containing the PDF
    :param cover:  when True, also try to read the extracted cover.jpg
    :raises RuntimeError: if the pdfinfo worker fails to run
    :raises ValueError:   if no info dict could be read from the PDF
    """
    with TemporaryDirectory('_pdf_metadata_read') as pdfpath:
        stream.seek(0)
        with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f:
            shutil.copyfileobj(stream, f)
        try:
            res = fork_job('calibre.ebooks.metadata.pdf', 'read_info',
                           (pdfpath, bool(cover)))
        except WorkerError as e:
            prints(e.orig_tb)
            raise RuntimeError('Failed to run pdfinfo')
        info = res['result']
        # Surface any worker chatter (warnings/errors) to our own output
        with open(res['stdout_stderr'], 'rb') as f:
            raw = f.read().strip()
            if raw:
                prints(raw)
        # An empty dict means pdfinfo produced nothing useful, treat it the
        # same as no result at all (was: ``if info is None``, which let an
        # empty dict slip through)
        if not info:
            raise ValueError('Could not read info dict from PDF')
        covpath = os.path.join(pdfpath, 'cover.jpg')
        cdata = None
        if cover and os.path.exists(covpath):
            with open(covpath, 'rb') as f:
                cdata = f.read()

    title = info.get('Title', None) or _('Unknown')
    au = info.get('Author', None)
    if au is None:
        au = [_('Unknown')]
    else:
        au = string_to_authors(au)
    mi = MetaInformation(title, au)
    # if isbn is not None:
    #     mi.isbn = isbn

    creator = info.get('Creator', None)
    if creator:
        mi.book_producer = creator

    keywords = info.get('Keywords', None)
    mi.tags = []
    if keywords:
        mi.tags = [x.strip() for x in keywords.split(',')]
        # A keyword that looks like an ISBN becomes the ISBN, and all
        # ISBN-looking keywords are dropped from the tag list
        isbn = [check_isbn(x) for x in mi.tags if check_isbn(x)]
        if isbn:
            mi.isbn = isbn = isbn[0]
        mi.tags = [x for x in mi.tags if check_isbn(x) != isbn]

    subject = info.get('Subject', None)
    if subject:
        mi.tags.insert(0, subject)

    if 'xmp_metadata' in info:
        from calibre.ebooks.metadata.xmp import consolidate_metadata
        mi = consolidate_metadata(mi, info)

    # Look for recognizable identifiers in the info dict, if they were not
    # found in the XMP metadata
    for scheme, check_func in iteritems({'doi': check_doi, 'isbn': check_isbn}):
        if scheme not in mi.get_identifiers():
            for k, v in iteritems(info):
                if k != 'xmp_metadata':
                    val = check_func(v)
                    if val:
                        mi.set_identifier(scheme, val)
                        break

    if cdata:
        mi.cover_data = ('jpeg', cdata)
    return mi
def get_metadata(stream, cover=True):
    """Read metadata (and optionally the cover) from a PDF ``stream``.

    The stream is copied to a temporary file and a worker process runs
    pdfinfo on it; the resulting info dict is turned into a
    ``MetaInformation``.

    :param stream: seekable binary file-like object containing the PDF
    :param cover:  when True, also try to read the extracted cover.jpg
    :raises RuntimeError: if the pdfinfo worker fails to run
    :raises ValueError:   if no info dict could be read from the PDF
    """
    with TemporaryDirectory('_pdf_metadata_read') as pdfpath:
        stream.seek(0)
        with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f:
            shutil.copyfileobj(stream, f)
        try:
            res = fork_job('calibre.ebooks.metadata.pdf', 'read_info',
                           (pdfpath, bool(cover)))
        except WorkerError as e:
            prints(e.orig_tb)
            raise RuntimeError('Failed to run pdfinfo')
        info = res['result']
        # Surface any worker chatter (warnings/errors) to our own output
        with open(res['stdout_stderr'], 'rb') as f:
            raw = f.read().strip()
            if raw:
                prints(raw)
        if not info:
            raise ValueError('Could not read info dict from PDF')
        covpath = os.path.join(pdfpath, 'cover.jpg')
        cdata = None
        if cover and os.path.exists(covpath):
            with open(covpath, 'rb') as f:
                cdata = f.read()

    title = info.get('Title', None)
    au = info.get('Author', None)
    if au is None:
        au = [_('Unknown')]
    else:
        au = string_to_authors(au)
    mi = MetaInformation(title, au)
    # if isbn is not None:
    #     mi.isbn = isbn

    creator = info.get('Creator', None)
    if creator:
        mi.book_producer = creator

    keywords = info.get('Keywords', None)
    mi.tags = []
    if keywords:
        mi.tags = [x.strip() for x in keywords.split(',')]
        # A keyword that looks like an ISBN becomes the ISBN, and all
        # ISBN-looking keywords are dropped from the tag list
        isbn = [check_isbn(x) for x in mi.tags if check_isbn(x)]
        if isbn:
            mi.isbn = isbn = isbn[0]
        mi.tags = [x for x in mi.tags if check_isbn(x) != isbn]

    subject = info.get('Subject', None)
    if subject:
        mi.tags.insert(0, subject)

    if 'xmp_metadata' in info:
        from calibre.ebooks.metadata.xmp import consolidate_metadata
        mi = consolidate_metadata(mi, info)

    # Look for recognizable identifiers in the info dict, if they were not
    # found in the XMP metadata. dict.iteritems() does not exist on
    # Python 3 (it raised AttributeError here); items() works on both.
    for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
        if scheme not in mi.get_identifiers():
            for k, v in info.items():
                if k != 'xmp_metadata':
                    val = check_func(v)
                    if val:
                        mi.set_identifier(scheme, val)
                        break

    if cdata:
        mi.cover_data = ('jpeg', cdata)
    return mi