def metadata(self):
    """Assemble a ``Metadata`` object from the container's property parts.

    Consults, in this order: the document-properties part, the default
    language in ``word/styles.xml`` (only while the language is still
    null) and the application-properties part registered under the
    ``APPPROPS`` relationship.  Any part for which ``self.read`` raises
    ``KeyError`` is skipped.
    """
    absent = object()

    def load(part):
        # Hand back the sentinel instead of raising when the part is missing.
        try:
            return self.read(part)
        except KeyError:
            return absent

    info = Metadata('Unknown')

    doc_props, _unused = self.get_document_properties_names()
    if doc_props:
        payload = load(doc_props)
        if payload is not absent:
            read_doc_props(payload, info, self.namespace.XPath)

    if info.is_null('language'):
        payload = load('word/styles.xml')
        if payload is not absent:
            read_default_style_language(payload, info, self.namespace.XPath)

    # The application-properties name comes from the relationships table,
    # not from get_document_properties_names().
    app_props = self.relationships.get(self.namespace.names['APPPROPS'], None)
    if app_props:
        payload = load(app_props)
        if payload is not absent:
            read_app_props(payload, info)

    return info
def convert_markdown_with_metadata(txt, title='', extensions=DEFAULT_MD_EXTENSIONS):
    """Convert Markdown text to HTML and collect its embedded metadata.

    :param txt: Markdown source text.
    :param title: Fallback title; ``'Unknown'`` is used when it is empty.
    :param extensions: Markdown extension names.  The ``'meta'`` extension
        is always enabled; the sequence passed in is never modified.
    :return: ``(mi, html)`` where ``mi`` is a ``Metadata`` object and
        ``html`` is ``HTML_TEMPLATE`` filled with the title and the
        converted body.
    """
    from ebook_converter.ebooks.metadata.book.base import Metadata
    from ebook_converter.utils.date import parse_only_date
    from ebook_converter.db.write import get_series_values

    # Work on a private copy: appending to the argument would permanently
    # alter the shared default list (and any list supplied by the caller).
    extensions = list(extensions)
    if 'meta' not in extensions:
        extensions.append('meta')

    md = create_markdown_object(extensions)
    html = md.convert(txt)
    mi = Metadata(title or 'Unknown')
    m = md.Meta

    # Accept 'date' and 'summary' as aliases unless the canonical key
    # is already present.
    for alias, canonical in {'date': 'pubdate', 'summary': 'comments'}.items():
        if canonical not in m and alias in m:
            m[canonical] = m.pop(alias)

    for k in 'title authors series tags pubdate comments publisher rating'.split():
        val = m.get(k)
        if not val:
            continue
        mf = mi.metadata_for_field(k)
        if not mf.get('is_multiple'):
            # Single-valued field: keep only the first entry.
            val = val[0]
        if k == 'series':
            val, si = get_series_values(val)
            mi.series_index = 1 if si is None else si
        if k == 'rating':
            try:
                # Clamp to the 0..10 range.
                val = max(0, min(int(float(val)), 10))
            except Exception:
                continue
        if mf.get('datatype') == 'datetime':
            try:
                val = parse_only_date(val, assume_utc=False)
            except Exception:
                continue
        setattr(mi, k, val)

    return mi, HTML_TEMPLATE % (mi.title, html)
def test_rtf_metadata(self):
    """Write metadata into an RTF stream, read it back and compare fields."""
    buf = BytesIO(br'{\rtf1\ansi\ansicpg1252}')

    written = Metadata('Test ø̄title', ['Author One', 'Author БTwo'])
    written.tags = 'tag1 見tag2'.split()
    written.comments = '<p>some ⊹comments</p>'
    written.publisher = 'publiSher'

    set_metadata(buf, written)
    buf.seek(0)
    read_back = get_metadata(buf)

    for field in 'title authors publisher comments tags'.split():
        expected = getattr(written, field)
        actual = getattr(read_back, field)
        self.assertEqual(expected, actual)
def generate_test_db(library_path,  # {{{
                     num_of_records=20000,
                     num_of_authors=6000,
                     num_of_tags=10000,
                     tag_length=7,
                     author_length=7,
                     title_length=10,
                     max_authors=10,
                     max_tags=10):
    """Populate a library with randomly generated book records.

    Creates ``library_path`` if it does not exist, opens it with the
    module-level ``db`` factory and imports ``num_of_records`` books whose
    titles, authors and tags are random strings of ASCII letters.
    Progress and timing information is printed to stdout.

    :param library_path: Directory of the library to create or fill.
    :param num_of_records: Number of books to import.
    :param num_of_authors: Size of the pool authors are drawn from.
    :param num_of_tags: Size of the pool tags are drawn from.
    :param tag_length: Length of each generated tag.
    :param author_length: Length of each generated author name.
    :param title_length: Length of each generated title.
    :param max_authors: Upper bound (inclusive) of authors per book; every
        book gets at least one.
    :param max_tags: Upper bound (inclusive) of tags per book; may be zero.
    """
    import os
    import random
    import string
    import sys
    import time

    from ebook_converter.ebooks.metadata.book.base import Metadata

    if not os.path.exists(library_path):
        os.makedirs(library_path)

    # string.letters only existed on Python 2; ascii_letters is already str.
    letters = string.ascii_letters

    def randstr(length):
        return ''.join(random.choice(letters) for _ in range(length))

    all_tags = [randstr(tag_length) for _ in range(num_of_tags)]
    print('Generated', num_of_tags, 'tags')
    all_authors = [randstr(author_length) for _ in range(num_of_authors)]
    print('Generated', num_of_authors, 'authors')
    all_titles = [randstr(title_length) for _ in range(num_of_records)]
    print('Generated', num_of_records, 'titles')

    testdb = db(library_path)

    print('Creating', num_of_records, 'records...')

    start = time.time()

    for i, title in enumerate(all_titles):
        print(i + 1, end=' ')
        sys.stdout.flush()
        author_count = random.randint(1, max_authors)
        authors = [random.choice(all_authors) for _ in range(author_count)]
        tag_count = random.randint(0, max_tags)
        tags = [random.choice(all_tags) for _ in range(tag_count)]
        mi = Metadata(title, authors)
        mi.tags = tags
        testdb.import_book(mi, [])

    t = time.time() - start
    print('\nGenerated', num_of_records, 'records in:', t, 'seconds')
    # Avoid ZeroDivisionError when asked to generate no records at all.
    if num_of_records:
        print('Time per record:', t / num_of_records)
def convert(self, stream, options, file_ext, log, accelerators):
    """Unpack a CHM file into a temporary directory and build an OEB book.

    :param stream: Open file object of the CHM; only its ``name`` is used
        and the stream is closed before the file is processed.
    :param options: Conversion options.  They are modified in place: the
        HTML input plugin's recommended values are applied and
        ``input_encoding`` is set to ``'utf-8'``.  ``debug_pipeline`` is
        cleared temporarily and restored afterwards.
    :param file_ext: Not used in this method.
    :param log: Logger used for debug and exception messages.
    :param accelerators: Not used in this method.
    :return: The object returned by ``self._create_oebbook_html``.
    """
    # NOTE(gryf): for some reason, those import cannot be moved to the top
    # of module.
    from ebook_converter.ebooks.chm.metadata import get_metadata_from_reader
    from ebook_converter.customize.ui import plugin_for_input_format
    self.opts = options

    log.debug('Processing CHM...')
    with TemporaryDirectory('_chm2oeb') as tdir:
        if not isinstance(tdir, str):
            tdir = tdir.decode(filesystem_encoding)
        # Apply the recommended value of every option of the HTML input
        # plugin to the options object.
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        no_images = False  # options.no_images
        chm_name = stream.name
        # chm_data = stream.read()

        # closing stream so CHM can be opened by external library
        stream.close()
        log.debug('tdir=%s', tdir)
        log.debug('stream.name=%s', stream.name)
        # debug_dump is either False or the 'input' sub-directory of the
        # debug pipeline directory.
        debug_dump = False
        odi = options.debug_pipeline
        if odi:
            debug_dump = os.path.join(odi, 'input')
        # NOTE(review): self._chm_reader is not assigned in this method;
        # presumably _chmtohtml sets it - confirm.
        mainname = self._chmtohtml(tdir, chm_name, no_images, log,
                                   debug_dump=debug_dump)
        mainpath = os.path.join(tdir, mainname)

        try:
            metadata = get_metadata_from_reader(self._chm_reader)
        except Exception:
            # Best effort: fall back to metadata titled after the file name.
            log.exception('Failed to read metadata, using filename')
            from ebook_converter.ebooks.metadata.book.base import Metadata
            metadata = Metadata(os.path.basename(chm_name))
        # The encoding must be captured before input_encoding is
        # overwritten below.
        encoding = (self._chm_reader.get_encoding() or
                    options.input_encoding or 'cp1252')
        self._chm_reader.CloseCHM()

        # debug_pipeline is cleared while the OEB book is created and
        # restored from ``odi`` afterwards.
        options.debug_pipeline = None
        options.input_encoding = 'utf-8'
        uenc = encoding
        # Files listed in re_encoded_files are read as UTF-8 instead of the
        # encoding reported by the reader.
        if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
            uenc = 'utf-8'
        htmlpath, toc = self._create_html_root(mainpath, log, uenc)
        oeb = self._create_oebbook_html(htmlpath, tdir, options, log,
                                        metadata)
        options.debug_pipeline = odi
        if toc.count() > 1:
            # Build the TOC from the first spine item, then drop that item
            # from the manifest.
            oeb.toc = self.parse_html_toc(oeb.spine[0])
            oeb.manifest.remove(oeb.spine[0])
            oeb.auto_generated_toc = False
    return oeb
def MetaInformation(title, authors=('Unknown', )):
    """Compatibility wrapper that builds a ``Metadata`` object.

    @param title: title or ``'Unknown'`` or a MetaInformation object
    @param authors: List of strings or []

    When ``title`` exposes both ``title`` and ``authors`` attributes it is
    treated as an existing metadata object: its title and authors are used
    and the object itself is passed on as ``other``.
    """
    from ebook_converter.ebooks.metadata.book.base import Metadata

    is_metadata_like = hasattr(title, 'title') and hasattr(title, 'authors')
    if not is_metadata_like:
        return Metadata(title, authors, other=None)

    source = title
    return Metadata(source.title, source.authors, other=source)
def test_input_comment_multi(self):
    """Metadata read from the 'comment_multi' stream matches the canon."""
    parsed = get_metadata(self.get_stream('comment_multi'))

    expected = Metadata(
        'A Comment Tag & Title Ⓒ',
        ['James Madison', 'James Monroe', 'John Quincy Adams'])
    expected.publisher = 'Publisher C'
    expected.languages = ['French', 'Japanese']
    expected.pubdate = parse_date('2015-01-01')
    expected.timestamp = parse_date('2014-01-01')
    expected.series = 'Comment Series'
    expected.series_index = 3.0
    expected.rating = 0.0
    expected.comments = 'comment "comments" ♥ HTML -- too &amp;'
    expected.tags = ['tag d', 'tag e', 'tag f']
    expected.set_identifiers({
        'isbn': '3456789012',
        'url': 'http://google.com/search?q=calibre',
    })

    self.compare_metadata(parsed, expected)
def test_input_meta_multi(self):
    """Metadata read from the 'meta_multi' stream matches the canon."""
    parsed = get_metadata(self.get_stream('meta_multi'))

    expected = Metadata(
        'A Meta Tag & Title Ⓒ',
        ['George Washington', 'John Adams', 'Thomas Jefferson'])
    expected.publisher = 'Publisher A'
    expected.languages = ['English', 'Spanish']
    expected.pubdate = parse_date('2019-01-01')
    expected.timestamp = parse_date('2018-01-01')
    expected.series = 'Meta Series'
    expected.series_index = 1.0
    expected.rating = 8.0
    expected.comments = 'meta "comments" ♥ HTML &amp;'
    expected.tags = ['tag a', 'tag b', 'tag c']
    expected.set_identifiers({
        'isbn': '1234567890',
        'url': 'http://google.com/search?q=calibre',
    })

    self.compare_metadata(parsed, expected)
def test_input_meta_single(self):
    """Metadata read from the 'meta_single' stream matches the canon."""
    parsed = get_metadata(self.get_stream('meta_single'))

    expected = Metadata('A Meta Tag & Title Ⓒ', ['George Washington'])
    expected.publisher = 'Publisher A'
    expected.languages = ['English']
    expected.pubdate = parse_date('2019-01-01')
    expected.timestamp = parse_date('2018-01-01')
    expected.series = 'Meta Series'
    expected.series_index = 1.0
    # rating and comments are intentionally left unset for this case.
    expected.tags = ['tag a', 'tag b']
    expected.set_identifiers({'isbn': '1234567890'})

    self.compare_metadata(parsed, expected)
def test_input_title(self):
    """Metadata read from the 'title' stream has only a title set."""
    parsed = get_metadata(self.get_stream('title'))
    expected = Metadata('A Title Tag & Title Ⓒ', ['Unknown'])
    self.compare_metadata(parsed, expected)
def get_metadata_(src, encoding=None):
    """Build a ``Metadata`` object from the head of an HTML document.

    :param src: HTML source as ``str`` or ``bytes``.  Bytes are decoded
        with ``encoding`` (errors replaced) or, when no encoding is given,
        through ``xml_to_unicode``.
    :param encoding: Optional encoding used to decode ``src`` bytes.
    :return: ``Metadata`` populated from whatever ``parse_metadata`` finds
        in the first 150000 characters.  Values from comment tags take
        precedence over values from meta tags.
    """
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9
    if isinstance(src, bytes):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        # All non-blank values for ``field`` (stripped), or None.
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        if not ans:
            ans = None
        return ans

    def get(field):
        # First non-blank value for ``field``, or None.
        ans = get_all(field)
        if ans:
            ans = ans[0]
        return ans

    # Title
    title = get('title') or title_tag.strip() or 'Unknown'

    # Author
    authors = authors_to_string(get_all('authors')) or 'Unknown'

    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))

    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    # Multi-value text fields
    for field in ('languages',):
        val = get_all(field)
        if val:
            setattr(mi, field, val)

    # HTML fields: the value is stored as markup, so escape the characters
    # that are special in HTML.  '&' must be replaced first so the
    # ampersands introduced by the later replacements are left alone.
    for field in ('comments',):
        val = get(field)
        if val:
            escaped = (val.replace('&', '&amp;')
                          .replace('<', '&lt;')
                          .replace('>', '&gt;')
                          .replace('"', '&quot;')
                          .replace("'", '&apos;'))
            setattr(mi, field, escaped)

    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except Exception:
            # Missing or unparseable date: leave the field untouched.
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        series_index = None
        # A trailing "[number]" in the series name carries the index.
        match = re.search(r'\[([.0-9]+)\]$', series)
        if match is not None:
            try:
                series_index = float(match.group(1))
            except ValueError:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            try:
                series_index = float(get('series_index'))
            except (TypeError, ValueError):
                # Absent or non-numeric: do not store a raw string as index.
                series_index = None
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            rating = float(rating)
        except (TypeError, ValueError):
            pass
        else:
            # Values outside 0..10 (on either side) are stored as 0.
            mi.rating = 0 if (rating < 0 or rating > 10) else rating

    # TAGS
    tags = get_all('tags')
    if tags:
        # Each entry may itself be a comma separated list.
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    # IDENTIFIERS
    for k, v in meta_tag_ids.items():
        v = [x.strip() for x in v if x.strip()]
        if v:
            mi.set_identifier(k, v[0])

    return mi
def validate(self, x):
    """Run ``safe_format`` on ``x`` with no fields and an empty Metadata.

    Returns whatever ``self.safe_format`` returns; ``'VALIDATE ERROR'`` is
    passed as its error-value argument.
    """
    from ebook_converter.ebooks.metadata.book.base import Metadata

    blank_book = Metadata('')
    no_fields = {}
    return self.safe_format(x, no_fields, 'VALIDATE ERROR', blank_book)
def metadata_from_xmp_packet(raw_bytes):
    """Parse an XMP packet and return its contents as a ``Metadata`` object.

    :param raw_bytes: The raw XMP packet, handed to ``parse_xmp_packet``.
    :return: ``Metadata`` titled ``'Unknown'`` unless the packet provides a
        title, with every field found in the packet filled in.
    :raises ValueError: If the title starts with the literal text
        ``\\376\\377``, which marks a corrupted packet.
    """
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata('Unknown')

    title = first_alt('//dc:title', root)
    if title:
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError('Corrupted XMP metadata packet detected, '
                             'probably generated by Nitro PDF')
        mi.title = title

    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors

    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags

    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments

    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]

    try:
        pubdate = parse_date(first_sequence('//dc:date', root) or
                             first_simple('//xmp:CreateDate', root),
                             assume_utc=False)
    except Exception:
        # Missing or unparseable date: leave pubdate unset.
        pass
    else:
        mi.pubdate = pubdate

    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp

    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd

    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass

    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index

    for x in ('title_sort', 'author_sort'):
        # Use the first element that yields a non-empty value.
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break

    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except Exception:
                # Invalid JSON is ignored; the field stays at its default.
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages

    # Identifiers are keyed by lower-cased scheme name throughout.
    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            key = scheme.lower()
            # Test the key that is actually stored, so a value found
            # earlier is never overwritten.
            if key in identifiers:
                continue
            val = first_simple('//%s:%s' % (namespace, scheme), root)
            if key == 'isbn':
                val = check_isbn(val)
            elif key == 'doi':
                val = check_doi(val)
            if val:
                identifiers[key] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # Store under the scheme that validated the value; using a
                # fixed 'doi' key would file ISBNs as DOIs.
                identifiers[scheme] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi