def get_metadata(stream, extract_cover=True): zin = zipfile.ZipFile(stream, 'r') odfs = odfmetaparser() parser = xml.sax.make_parser() parser.setFeature(xml.sax.handler.feature_namespaces, 1) parser.setContentHandler(odfs) content = zin.read('meta.xml') parser.parse(StringIO(content)) data = odfs.seenfields mi = MetaInformation(None, []) if data.has_key('title'): mi.title = data['title'] if data.get('initial-creator', '').strip(): mi.authors = string_to_authors(data['initial-creator']) elif data.has_key('creator'): mi.authors = string_to_authors(data['creator']) if data.has_key('description'): mi.comments = data['description'] if data.has_key('language'): mi.language = data['language'] if data.get('keywords', ''): mi.tags = [x.strip() for x in data['keywords'].split(',') if x.strip()] opfmeta = False # we need this later for the cover opfnocover = False if data.get('opf.metadata','') == 'true': # custom metadata contains OPF information opfmeta = True if data.get('opf.titlesort', ''): mi.title_sort = data['opf.titlesort'] if data.get('opf.authors', ''): mi.authors = string_to_authors(data['opf.authors']) if data.get('opf.authorsort', ''): mi.author_sort = data['opf.authorsort'] if data.get('opf.isbn', ''): isbn = check_isbn(data['opf.isbn']) if isbn is not None: mi.isbn = isbn if data.get('opf.publisher', ''): mi.publisher = data['opf.publisher'] if data.get('opf.pubdate', ''): mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True) if data.get('opf.series', ''): mi.series = data['opf.series'] if data.get('opf.seriesindex', ''): try: mi.series_index = float(data['opf.seriesindex']) except ValueError: mi.series_index = 1.0 if data.get('opf.language', ''): cl = canonicalize_lang(data['opf.language']) if cl: mi.languages = [cl] opfnocover = data.get('opf.nocover', 'false') == 'true' if not opfnocover: try: read_cover(stream, zin, mi, opfmeta, extract_cover) except: pass # Do not let an error reading the cover prevent reading other data return mi
def get_metadata(stream, extract_cover=True): zin = zipfile.ZipFile(stream, 'r') odfs = odfmetaparser() parser = xml.sax.make_parser() parser.setFeature(xml.sax.handler.feature_namespaces, 1) parser.setContentHandler(odfs) content = zin.read('meta.xml') parser.parse(StringIO(content)) data = odfs.seenfields mi = MetaInformation(None, []) if 'title' in data: mi.title = data['title'] if data.get('initial-creator', '').strip(): mi.authors = string_to_authors(data['initial-creator']) elif 'creator' in data: mi.authors = string_to_authors(data['creator']) if 'description' in data: mi.comments = data['description'] if 'language' in data: mi.language = data['language'] if data.get('keywords', ''): mi.tags = [x.strip() for x in data['keywords'].split(',') if x.strip()] opfmeta = False # we need this later for the cover opfnocover = False if data.get('opf.metadata','') == 'true': # custom metadata contains OPF information opfmeta = True if data.get('opf.titlesort', ''): mi.title_sort = data['opf.titlesort'] if data.get('opf.authors', ''): mi.authors = string_to_authors(data['opf.authors']) if data.get('opf.authorsort', ''): mi.author_sort = data['opf.authorsort'] if data.get('opf.isbn', ''): isbn = check_isbn(data['opf.isbn']) if isbn is not None: mi.isbn = isbn if data.get('opf.publisher', ''): mi.publisher = data['opf.publisher'] if data.get('opf.pubdate', ''): mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True) if data.get('opf.series', ''): mi.series = data['opf.series'] if data.get('opf.seriesindex', ''): try: mi.series_index = float(data['opf.seriesindex']) except ValueError: mi.series_index = 1.0 if data.get('opf.language', ''): cl = canonicalize_lang(data['opf.language']) if cl: mi.languages = [cl] opfnocover = data.get('opf.nocover', 'false') == 'true' if not opfnocover: try: read_cover(stream, zin, mi, opfmeta, extract_cover) except: pass # Do not let an error reading the cover prevent reading other data return mi
def get_metadata_(path, cpath=None): p = podofo.PDFDoc() p.open(path) title = p.title if not title: title = '_' author = p.author authors = string_to_authors(author) if author else [_('Unknown')] creator = p.creator try: tags = [x.strip() for x in p.keywords.split(u',')] tags = [x for x in tags if x] except: tags = [] ok = True try: if cpath is not None: pages = p.pages if pages < 1: raise ValueError('PDF has no pages') if True or pages == 1: shutil.copyfile(path, cpath) else: p.extract_first_page() p.save(cpath) except: import traceback traceback.print_exc() ok = False return (title, authors, creator, tags, ok)
def get_metadata(stream): """ Return metadata as a L{MetaInfo} object """ title = 'Unknown' mi = MetaInformation(title, ['Unknown']) stream.seek(0) try: if stream.read(10) not in MAGIC: print >>sys.stderr, u'Couldn\'t read IMP header from file' return mi def cString(skip=0): result = '' while 1: data = stream.read(1) if data == '\x00': if not skip: return result skip -= 1 result, data = '', '' result += data stream.read(38) # skip past some uninteresting headers _, category, title, author = cString(), cString(), cString(1), cString(2) if title: mi.title = title if author: mi.authors = string_to_authors(author) mi.author = author if category: mi.category = category except Exception as err: msg = u'Couldn\'t read metadata from imp: %s with error %s'%(mi.title, unicode(err)) print >>sys.stderr, msg.encode('utf8') return mi
def cell_changed(self, row, col): id_ = int(self.table.item(row, 0).data(Qt.UserRole)) if col == 0: item = self.table.item(row, 0) aut = unicode_type(item.text()).strip() aut_list = string_to_authors(aut) if len(aut_list) != 1: error_dialog( self.parent(), _('Invalid author name'), _('You cannot change an author to multiple authors.') ).exec_() aut = ' % '.join(aut_list) self.table.item(row, 0).setText(aut) item.set_sort_key() self.authors[id_]['name'] = aut self.set_icon(item, id_) c = self.table.item(row, 1) txt = author_to_author_sort(aut) self.authors[id_]['sort'] = txt c.setText(txt) # This triggers another cellChanged event item = c else: item = self.table.item(row, col) item.set_sort_key() self.set_icon(item, id_) self.authors[id_][self.get_column_name(col)] = unicode_type( item.text()) self.table.setCurrentItem(item) self.table.scrollToItem(item)
def _read_doc_props(raw, mi): from calibre.ebooks.metadata import string_to_authors root = etree.fromstring(raw, parser=RECOVER_PARSER) titles = XPath('//dc:title')(root) if titles: title = titles[0].text if title and title.strip(): mi.title = title.strip() tags = [] for subject in XPath('//dc:subject')(root): if subject.text and subject.text.strip(): tags.append(subject.text.strip().replace(',', '_')) for keywords in XPath('//cp:keywords')(root): if keywords.text and keywords.text.strip(): for x in keywords.text.split(): tags.extend(y.strip() for y in x.split(',')) if tags: mi.tags = tags authors = XPath('//dc:creator')(root) aut = [] for author in authors: if author.text and author.text.strip(): aut.extend(string_to_authors(author.text)) if aut: mi.authors = aut desc = XPath('//dc:description')(root) if desc: raw = etree.tostring(desc[0], method='text', encoding=unicode) mi.comments = raw
def get_title_and_authors(self): title = unicode(self.title.text()).strip() if not title: title = _('Unknown') authors = unicode(self.author.text()).strip() authors = string_to_authors(authors) if authors else [_('Unknown')] return title, authors
def _get_authors(soup): aut = (_metadata_from_span(soup, r'author') or _metadata_from_table(soup, r'^\s*by\s*:?\s+')) ans = [_('Unknown')] if aut is not None: ans = string_to_authors(aut) return ans
def get_title_and_authors(self): title = str(self.title.text()).strip() if not title: title = _('Unknown') authors = str(self.author.text()).strip() authors = string_to_authors(authors) if authors else [_('Unknown')] return title, authors
def get_djvu_metadata(stream, cover=True): with TemporaryDirectory('_djvu_metadata_read') as djvupath: stream.seek(0) with open(os.path.join(djvupath, 'src.djvu'), 'wb') as f: shutil.copyfileobj(stream, f) try: res = fork_job('calibre_plugins.djvu_metadata.djvu', 'get_djvu_metadata_worker', (djvupath, bool(cover))) except WorkerError as e: prints(e.orig_tb) raise RuntimeError('Failed to run djvused') info = res['result'] with open(res['stdout_stderr'], 'rb') as f: raw = f.read().strip() if raw: prints(raw) if info is None: raise ValueError('Could not read metadata from djvu') covpath = os.path.join(djvupath, 'cover.jpg') cdata = None if cover and os.path.exists(covpath): with open(covpath, 'rb') as f: cdata = f.read() title = info.get('Title', None) au = info.get('Author', None) if au is None: au = [_('Unknown')] else: au = string_to_authors(au) mi = MetaInformation(title, au) if cdata: mi.cover_data = ('jpg', cdata) return mi
def cell_changed(self, row, col): id_ = int(self.table.item(row, 0).data(Qt.UserRole)) if col == 0: item = self.table.item(row, 0) item.setIcon(self.edited_icon) aut = unicode_type(item.text()).strip() aut_list = string_to_authors(aut) if len(aut_list) != 1: error_dialog( self.parent(), _('Invalid author name'), _('You cannot change an author to multiple authors.') ).exec_() aut = ' % '.join(aut_list) self.table.item(row, 0).setText(aut) self.authors[id_]['name'] = aut c = self.table.item(row, 1) txt = author_to_author_sort(aut) c.setText(txt) self.authors[id_]['sort'] = txt item = c else: item = self.table.item(row, col) item.setIcon(self.edited_icon) if col == 1: self.authors[id_]['sort'] = unicode_type(item.text()) else: self.authors[id_]['link'] = unicode_type(item.text()) self.table.setCurrentItem(item) self.table.scrollToItem(item)
def main(args=sys.argv): parser = option_parser() opts, args = parser.parse_args(args) buf = BytesIO() log = create_log(buf) abort = Event() patch_plugins() authors = [] if opts.authors: authors = string_to_authors(opts.authors) identifiers = {} if opts.isbn: identifiers['isbn'] = opts.isbn allowed_plugins = frozenset(opts.allowed_plugin) results = identify(log, abort, title=opts.title, authors=authors, identifiers=identifiers, timeout=int(opts.timeout), allowed_plugins=allowed_plugins or None) if not results: print(log, file=sys.stderr) prints('No results found', file=sys.stderr) raise SystemExit(1) result = results[0] cf = None if opts.cover and results: cover = download_cover(log, title=opts.title, authors=authors, identifiers=result.identifiers, timeout=int(opts.timeout)) if cover is None and not opts.opf: prints('No cover found', file=sys.stderr) else: save_cover_data_to(cover[-1], opts.cover) result.cover = cf = opts.cover log = buf.getvalue() result = (metadata_to_opf(result) if opts.opf else unicode(result).encode('utf-8')) if opts.verbose: print(log, file=sys.stderr) print(result) if not opts.opf and opts.cover: prints('Cover :', cf) return 0
def main(args=sys.argv): parser = option_parser() opts, args = parser.parse_args(args) buf = BytesIO() log = create_log(buf) abort = Event() patch_plugins() authors = [] if opts.authors: authors = string_to_authors(opts.authors) identifiers = {} for idspec in opts.identifier: k, v = idspec.partition(':')[::2] if not k or not v: raise SystemExit('Not a valid identifier: {}'.format(idspec)) identifiers[k] = v if opts.isbn: identifiers['isbn'] = opts.isbn allowed_plugins = frozenset(opts.allowed_plugin) results = identify(log, abort, title=opts.title, authors=authors, identifiers=identifiers, timeout=int(opts.timeout), allowed_plugins=allowed_plugins or None) if not results: prints(buf.getvalue(), file=sys.stderr) prints('No results found', file=sys.stderr) raise SystemExit(1) result = results[0] cf = None if opts.cover and results: cover = download_cover(log, title=opts.title, authors=authors, identifiers=result.identifiers, timeout=int(opts.timeout)) if cover is None: if not opts.opf: prints('No cover found', file=sys.stderr) else: save_cover_data_to(cover[-1], opts.cover) result.cover = cf = opts.cover if opts.verbose: prints(buf.getvalue(), file=sys.stderr) if opts.opf: getattr(sys.stdout, 'buffer', sys.stdout).write(metadata_to_opf(result)) print() else: prints(str(result)) if not opts.opf and opts.cover: prints('Cover :', cf) return 0
def main(args=sys.argv): parser = option_parser() opts, args = parser.parse_args(args) buf = BytesIO() log = create_log(buf) abort = Event() patch_plugins() authors = [] if opts.authors: authors = string_to_authors(opts.authors) identifiers = {} for idspec in opts.identifier: k, v = idspec.partition(':')[::2] if not k or not v: raise SystemExit('Not a valid identifier: {}'.format(idspec)) identifiers[k] = v if opts.isbn: identifiers['isbn'] = opts.isbn allowed_plugins = frozenset(opts.allowed_plugin) results = identify(log, abort, title=opts.title, authors=authors, identifiers=identifiers, timeout=int(opts.timeout), allowed_plugins=allowed_plugins or None) if not results: print(log, file=sys.stderr) prints('No results found', file=sys.stderr) raise SystemExit(1) result = results[0] cf = None if opts.cover and results: cover = download_cover(log, title=opts.title, authors=authors, identifiers=result.identifiers, timeout=int(opts.timeout)) if cover is None and not opts.opf: prints('No cover found', file=sys.stderr) else: save_cover_data_to(cover[-1], opts.cover) result.cover = cf = opts.cover log = buf.getvalue() result = (metadata_to_opf(result) if opts.opf else unicode_type(result).encode('utf-8')) if opts.verbose: print(log, file=sys.stderr) print(result) if not opts.opf and opts.cover: prints('Cover :', cf) return 0
def get_metadata(stream, cover=True): with TemporaryDirectory('_pdf_metadata_read') as pdfpath: stream.seek(0) with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f: shutil.copyfileobj(stream, f) try: res = fork_job('calibre.ebooks.metadata.pdf', 'read_info', (pdfpath, bool(cover))) except WorkerError as e: prints(e.orig_tb) raise RuntimeError('Failed to run pdfinfo') info = res['result'] with open(res['stdout_stderr'], 'rb') as f: raw = f.read().strip() if raw: prints(raw) if not info: raise ValueError('Could not read info dict from PDF') covpath = os.path.join(pdfpath, 'cover.jpg') cdata = None if cover and os.path.exists(covpath): with open(covpath, 'rb') as f: cdata = f.read() title = info.get('Title', None) au = info.get('Author', None) if au is None: au = [_('Unknown')] else: au = string_to_authors(au) mi = MetaInformation(title, au) # if isbn is not None: # mi.isbn = isbn creator = info.get('Creator', None) if creator: mi.book_producer = creator keywords = info.get('Keywords', None) mi.tags = [] if keywords: mi.tags = [x.strip() for x in keywords.split(',')] isbn = [check_isbn(x) for x in mi.tags if check_isbn(x)] if isbn: mi.isbn = isbn = isbn[0] mi.tags = [x for x in mi.tags if check_isbn(x) != isbn] subject = info.get('Subject', None) if subject: mi.tags.insert(0, subject) if cdata: mi.cover_data = ('jpeg', cdata) return mi
def get_metadata(stream, cover=True): with TemporaryDirectory("_pdf_metadata_read") as pdfpath: stream.seek(0) with open(os.path.join(pdfpath, "src.pdf"), "wb") as f: shutil.copyfileobj(stream, f) try: res = fork_job("calibre.ebooks.metadata.pdf", "read_info", (pdfpath, bool(cover))) except WorkerError as e: prints(e.orig_tb) raise RuntimeError("Failed to run pdfinfo") info = res["result"] with open(res["stdout_stderr"], "rb") as f: raw = f.read().strip() if raw: prints(raw) if not info: raise ValueError("Could not read info dict from PDF") covpath = os.path.join(pdfpath, "cover.jpg") cdata = None if cover and os.path.exists(covpath): with open(covpath, "rb") as f: cdata = f.read() title = info.get("Title", None) au = info.get("Author", None) if au is None: au = [_("Unknown")] else: au = string_to_authors(au) mi = MetaInformation(title, au) # if isbn is not None: # mi.isbn = isbn creator = info.get("Creator", None) if creator: mi.book_producer = creator keywords = info.get("Keywords", None) mi.tags = [] if keywords: mi.tags = [x.strip() for x in keywords.split(",")] isbn = [check_isbn(x) for x in mi.tags if check_isbn(x)] if isbn: mi.isbn = isbn = isbn[0] mi.tags = [x for x in mi.tags if check_isbn(x) != isbn] subject = info.get("Subject", None) if subject: mi.tags.insert(0, subject) if cdata: mi.cover_data = ("jpeg", cdata) return mi
def do_set_metadata(opts, mi, stream, stream_type): mi = MetaInformation(mi) for x in ('guide', 'toc', 'manifest', 'spine'): setattr(mi, x, None) from_opf = getattr(opts, 'from_opf', None) if from_opf is not None: from calibre.ebooks.metadata.opf2 import OPF opf_mi = OPF(open(from_opf, 'rb')).to_book_metadata() mi.smart_update(opf_mi) for pref in config().option_set.preferences: if pref.name in ('to_opf', 'from_opf', 'authors', 'title_sort', 'author_sort', 'get_cover', 'cover', 'tags', 'lrf_bookid', 'identifiers'): continue val = getattr(opts, pref.name, None) if val is not None: setattr(mi, pref.name, val) if getattr(opts, 'authors', None) is not None: mi.authors = string_to_authors(opts.authors) mi.author_sort = authors_to_sort_string(mi.authors) if getattr(opts, 'author_sort', None) is not None: mi.author_sort = opts.author_sort if getattr(opts, 'title_sort', None) is not None: mi.title_sort = opts.title_sort elif getattr(opts, 'title', None) is not None: mi.title_sort = title_sort(opts.title) if getattr(opts, 'tags', None) is not None: mi.tags = [t.strip() for t in opts.tags.split(',')] if getattr(opts, 'series', None) is not None: mi.series = opts.series.strip() if getattr(opts, 'series_index', None) is not None: mi.series_index = float(opts.series_index.strip()) if getattr(opts, 'pubdate', None) is not None: mi.pubdate = parse_date(opts.pubdate, assume_utc=False, as_utc=False) if getattr(opts, 'identifiers', None): val = { k.strip(): v.strip() for k, v in (x.partition(':')[0::2] for x in opts.identifiers) } if val: orig = mi.get_identifiers() orig.update(val) val = {k: v for k, v in orig.iteritems() if k and v} mi.set_identifiers(val) if getattr(opts, 'cover', None) is not None: ext = os.path.splitext(opts.cover)[1].replace('.', '').upper() mi.cover_data = (ext, open(opts.cover, 'rb').read()) with force_identifiers: set_metadata(stream, mi, stream_type)
def get_metadata(stream, cover=True): with TemporaryDirectory('_pdf_metadata_read') as pdfpath: stream.seek(0) with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f: shutil.copyfileobj(stream, f) try: res = fork_job('calibre.ebooks.metadata.pdf', 'read_info', (pdfpath, bool(cover))) except WorkerError as e: prints(e.orig_tb) raise RuntimeError('Failed to run pdfinfo') info = res['result'] with open(res['stdout_stderr'], 'rb') as f: raw = f.read().strip() if raw: prints(raw) if not info: raise ValueError('Could not read info dict from PDF') covpath = os.path.join(pdfpath, 'cover.jpg') cdata = None if cover and os.path.exists(covpath): with open(covpath, 'rb') as f: cdata = f.read() title = info.get('Title', None) au = info.get('Author', None) if au is None: au = [_('Unknown')] else: au = string_to_authors(au) mi = MetaInformation(title, au) #if isbn is not None: # mi.isbn = isbn creator = info.get('Creator', None) if creator: mi.book_producer = creator keywords = info.get('Keywords', None) mi.tags = [] if keywords: mi.tags = [x.strip() for x in keywords.split(',')] subject = info.get('Subject', None) if subject: mi.tags.insert(0, subject) if cdata: mi.cover_data = ('jpeg', cdata) return mi
def apply_metadata(root, mi, cover_prefix='', cover_data=None, apply_null=False, update_timestamp=False, force_identifiers=False): prefixes, refines = read_prefixes(root), read_refines(root) set_identifiers(root, prefixes, refines, mi.identifiers, force_identifiers=force_identifiers) set_title(root, prefixes, refines, mi.title, mi.title_sort) set_languages(root, prefixes, refines, mi.languages) aus = string_to_authors(mi.author_sort or '') authors = [] for i, aut in enumerate(mi.authors): authors.append(Author(aut, aus[i] if i < len(aus) else None)) set_authors(root, prefixes, refines, authors) set_pubdate(root, prefixes, refines, mi.pubdate) set_timestamp(root, prefixes, refines, mi.timestamp) pretty_print_opf(root)
def main(args=sys.argv): parser = option_parser() opts, args = parser.parse_args(args) buf = BytesIO() log = create_log(buf) abort = Event() authors = [] if opts.authors: authors = string_to_authors(opts.authors) identifiers = {} if opts.isbn: identifiers['isbn'] = opts.isbn results = identify(log, abort, title=opts.title, authors=authors, identifiers=identifiers, timeout=int(opts.timeout)) if not results: print (log, file=sys.stderr) prints('No results found', file=sys.stderr) raise SystemExit(1) result = results[0] cf = None if opts.cover and results: cover = download_cover(log, title=opts.title, authors=authors, identifiers=result.identifiers, timeout=int(opts.timeout)) if cover is None: prints('No cover found', file=sys.stderr) else: save_cover_data_to(cover[-1], opts.cover) result.cover = cf = opts.cover log = buf.getvalue() result = (metadata_to_opf(result) if opts.opf else unicode(result).encode('utf-8')) if opts.verbose: print (log, file=sys.stderr) print (result) if not opts.opf and opts.cover: prints('Cover :', cf) return 0
def do_set_metadata(opts, mi, stream, stream_type): mi = MetaInformation(mi) for x in ('guide', 'toc', 'manifest', 'spine'): setattr(mi, x, None) from_opf = getattr(opts, 'from_opf', None) if from_opf is not None: from calibre.ebooks.metadata.opf2 import OPF opf_mi = OPF(open(from_opf, 'rb')).to_book_metadata() mi.smart_update(opf_mi) for pref in config().option_set.preferences: if pref.name in ('to_opf', 'from_opf', 'authors', 'title_sort', 'author_sort', 'get_cover', 'cover', 'tags', 'lrf_bookid', 'identifiers'): continue val = getattr(opts, pref.name, None) if val is not None: setattr(mi, pref.name, val) if getattr(opts, 'authors', None) is not None: mi.authors = string_to_authors(opts.authors) mi.author_sort = authors_to_sort_string(mi.authors) if getattr(opts, 'author_sort', None) is not None: mi.author_sort = opts.author_sort if getattr(opts, 'title_sort', None) is not None: mi.title_sort = opts.title_sort elif getattr(opts, 'title', None) is not None: mi.title_sort = title_sort(opts.title) if getattr(opts, 'tags', None) is not None: mi.tags = [t.strip() for t in opts.tags.split(',')] if getattr(opts, 'series', None) is not None: mi.series = opts.series.strip() if getattr(opts, 'series_index', None) is not None: mi.series_index = float(opts.series_index.strip()) if getattr(opts, 'pubdate', None) is not None: mi.pubdate = parse_date(opts.pubdate, assume_utc=False, as_utc=False) if getattr(opts, 'identifiers', None): val = {k.strip():v.strip() for k, v in (x.partition(':')[0::2] for x in opts.identifiers)} if val: orig = mi.get_identifiers() orig.update(val) val = {k:v for k, v in iteritems(orig) if k and v} mi.set_identifiers(val) if getattr(opts, 'cover', None) is not None: ext = os.path.splitext(opts.cover)[1].replace('.', '').upper() mi.cover_data = (ext, open(opts.cover, 'rb').read()) with force_identifiers: set_metadata(stream, mi, stream_type)
def get_metadata(f): read = lambda at, amount: _read(f, at, amount) f.seek(0) buf = f.read(12) if buf[4:] == 'ftypLRX2': offset = 0 while True: offset += word_be(buf[:4]) try: buf = read(offset, 8) except: raise ValueError('Not a valid LRX file') if buf[4:] == 'bbeb': break offset += 8 buf = read(offset, 16) if buf[:8].decode('utf-16-le') != 'LRF\x00': raise ValueError('Not a valid LRX file') lrf_version = word_le(buf[8:12]) offset += 0x4c compressed_size = short_le(read(offset, 2)) offset += 2 if lrf_version >= 800: offset += 6 compressed_size -= 4 uncompressed_size = word_le(read(offset, 4)) info = decompress(f.read(compressed_size)) if len(info) != uncompressed_size: raise ValueError('LRX file has malformed metadata section') root = etree.fromstring(info) bi = root.find('BookInfo') title = bi.find('Title') title_sort = title.get('reading', None) title = title.text author = bi.find('Author') author_sort = author.get('reading', None) mi = MetaInformation(title, string_to_authors(author.text)) mi.title_sort, mi.author_sort = title_sort, author_sort author = author.text publisher = bi.find('Publisher') mi.publisher = getattr(publisher, 'text', None) mi.tags = [x.text for x in bi.findall('Category')] mi.language = root.find('DocInfo').find('Language').text return mi elif buf[4:8] == 'LRX': raise ValueError('Librie LRX format not supported') else: raise ValueError('Not a LRX file')
def get_metadata(f): read = lambda at, amount: _read(f, at, amount) f.seek(0) buf = f.read(12) if buf[4:] == b'ftypLRX2': offset = 0 while True: offset += word_be(buf[:4]) try: buf = read(offset, 8) except: raise ValueError('Not a valid LRX file') if buf[4:] == b'bbeb': break offset += 8 buf = read(offset, 16) if buf[:8].decode('utf-16-le') != 'LRF\x00': raise ValueError('Not a valid LRX file') lrf_version = word_le(buf[8:12]) offset += 0x4c compressed_size = short_le(read(offset, 2)) offset += 2 if lrf_version >= 800: offset += 6 compressed_size -= 4 uncompressed_size = word_le(read(offset, 4)) info = decompress(f.read(compressed_size)) if len(info) != uncompressed_size: raise ValueError('LRX file has malformed metadata section') root = safe_xml_fromstring(info) bi = root.find('BookInfo') title = bi.find('Title') title_sort = title.get('reading', None) title = title.text author = bi.find('Author') author_sort = author.get('reading', None) mi = MetaInformation(title, string_to_authors(author.text)) mi.title_sort, mi.author_sort = title_sort, author_sort author = author.text publisher = bi.find('Publisher') mi.publisher = getattr(publisher, 'text', None) mi.tags = [x.text for x in bi.findall('Category')] mi.language = root.find('DocInfo').find('Language').text return mi elif buf[4:8] == b'LRX': raise ValueError('Librie LRX format not supported') else: raise ValueError('Not a LRX file')
def get_metadata(stream): zin = zipfile.ZipFile(stream, 'r') odfs = odfmetaparser() parser = xml.sax.make_parser() parser.setFeature(xml.sax.handler.feature_namespaces, 1) parser.setContentHandler(odfs) content = zin.read('meta.xml') parser.parse(StringIO(content)) data = odfs.seenfields mi = MetaInformation(None, []) if data.has_key('title'): mi.title = data['title'] if data.get('initial-creator', '').strip(): mi.authors = string_to_authors(data['initial-creator']) elif data.has_key('creator'): mi.authors = string_to_authors(data['creator']) if data.has_key('description'): mi.comments = data['description'] if data.has_key('language'): mi.language = data['language'] if data.get('keywords', ''): mi.tags = data['keywords'].split(',') return mi
def add_author(self): text = self.author.text().strip() authors = OrderedDict((icu_lower(x), (i, x)) for i, x in enumerate(self.authors)) if text: for author in string_to_authors(text): la = icu_lower(author) if la in authors and authors[la][1] != author: # Case change i = authors[la][0] authors[la] = (i, author) self.al.item(i).setText(author) else: self.al.addItem(author) authors[la] = author self.author.setText('')
def update_state(self, *args): au = unicode(self.authors_edit.text()) au = re.sub(r'\s+et al\.$', '', au) au = self.db.author_sort_from_authors(string_to_authors(au)) normal = strcmp(au, self.current_val) == 0 if normal: col = 'rgb(0, 255, 0, 20%)' else: col = 'rgb(255, 0, 0, 20%)' self.setStyleSheet('QLineEdit { color: black; ' 'background-color: %s; }'%col) tt = self.tooltips[0 if normal else 1] self.setToolTip(tt) self.setWhatsThis(tt)
def update_state(self, *args): au = unicode(self.authors_edit.text()) au = re.sub(r'\s+et al\.$', '', au) au = self.db.author_sort_from_authors(string_to_authors(au)) normal = strcmp(au, self.current_val) == 0 if normal: col = 'rgb(0, 255, 0, 20%)' else: col = 'rgb(255, 0, 0, 20%)' self.setStyleSheet('QLineEdit { color: black; ' 'background-color: %s; }' % col) tt = self.tooltips[0 if normal else 1] self.setToolTip(tt) self.setWhatsThis(tt)
def cell_changed(self, row, col): if col == 0: item = self.table.item(row, 0) aut = unicode(item.text()).strip() aut_list = string_to_authors(aut) if len(aut_list) != 1: error_dialog(self.parent(), _('Invalid author name'), _('You cannot change an author to multiple authors.')).exec_() aut = ' % '.join(aut_list) self.table.item(row, 0).setText(aut) c = self.table.item(row, 1) c.setText(author_to_author_sort(aut)) item = c else: item = self.table.item(row, col) self.table.setCurrentItem(item) self.table.scrollToItem(item)
def cell_changed(self, row, col): if col == 0: item = self.table.item(row, 0) aut = unicode_type(item.text()).strip() aut_list = string_to_authors(aut) if len(aut_list) != 1: error_dialog(self.parent(), _('Invalid author name'), _('You cannot change an author to multiple authors.')).exec_() aut = ' % '.join(aut_list) self.table.item(row, 0).setText(aut) c = self.table.item(row, 1) c.setText(author_to_author_sort(aut)) item = c else: item = self.table.item(row, col) self.table.setCurrentItem(item) self.table.scrollToItem(item)
def main(opts, args, dbctx): aut = string_to_authors(opts.authors) if opts.authors else [] tags = [x.strip() for x in opts.tags.split(',')] if opts.tags else [] lcodes = [canonicalize_lang(x) for x in (opts.languages or '').split(',')] lcodes = [x for x in lcodes if x] identifiers = (x.partition(':')[::2] for x in opts.identifier) identifiers = dict((k.strip(), v.strip()) for k, v in identifiers if k.strip() and v.strip()) if opts.empty: do_add_empty(dbctx, opts.title, aut, opts.isbn, tags, opts.series, opts.series_index, opts.cover, identifiers, lcodes) return 0 if len(args) < 1: raise SystemExit(_('You must specify at least one file to add')) do_add(dbctx, args, opts.one_book_per_directory, opts.recurse, opts.duplicates, opts.title, aut, opts.isbn, tags, opts.series, opts.series_index, opts.cover, identifiers, lcodes, opts.filters) return 0
def get_metadata(stream): """ Return basic meta-data about the LRF file in C{stream} as a L{MetaInformation} object. @param stream: A file like object or an instance of L{LRFMetaFile} """ lrf = stream if isinstance(stream, LRFMetaFile) else LRFMetaFile(stream) authors = string_to_authors(lrf.author) mi = MetaInformation(lrf.title.strip(), authors) mi.author = lrf.author.strip() mi.comments = lrf.free_text.strip() mi.category = lrf.category.strip()+', '+lrf.classification.strip() tags = [x.strip() for x in mi.category.split(',') if x.strip()] if tags: mi.tags = tags if mi.category.strip() == ',': mi.category = None mi.publisher = lrf.publisher.strip() mi.cover_data = lrf.get_cover() try: mi.title_sort = lrf.title_reading.strip() if not mi.title_sort: mi.title_sort = None except: pass try: mi.author_sort = lrf.author_reading.strip() if not mi.author_sort: mi.author_sort = None except: pass if not mi.title or 'unknown' in mi.title.lower(): mi.title = None if not mi.authors: mi.authors = None if not mi.author or 'unknown' in mi.author.lower(): mi.author = None if not mi.category or 'unknown' in mi.category.lower(): mi.category = None if not mi.publisher or 'unknown' in mi.publisher.lower() or \ 'some publisher' in mi.publisher.lower(): mi.publisher = None return mi
def get_metadata(stream): """ Return basic meta-data about the LRF file in C{stream} as a L{MetaInformation} object. @param stream: A file like object or an instance of L{LRFMetaFile} """ lrf = stream if isinstance(stream, LRFMetaFile) else LRFMetaFile(stream) authors = string_to_authors(lrf.author) mi = MetaInformation(lrf.title.strip(), authors) mi.author = lrf.author.strip() mi.comments = lrf.free_text.strip() mi.category = lrf.category.strip() + ', ' + lrf.classification.strip() tags = [x.strip() for x in mi.category.split(',') if x.strip()] if tags: mi.tags = tags if mi.category.strip() == ',': mi.category = None mi.publisher = lrf.publisher.strip() mi.cover_data = lrf.get_cover() try: mi.title_sort = lrf.title_reading.strip() if not mi.title_sort: mi.title_sort = None except: pass try: mi.author_sort = lrf.author_reading.strip() if not mi.author_sort: mi.author_sort = None except: pass if not mi.title or 'unknown' in mi.title.lower(): mi.title = None if not mi.authors: mi.authors = None if not mi.author or 'unknown' in mi.author.lower(): mi.author = None if not mi.category or 'unknown' in mi.category.lower(): mi.category = None if not mi.publisher or 'unknown' in mi.publisher.lower() or \ 'some publisher' in mi.publisher.lower(): mi.publisher = None return mi
def metadata_from_path(cls, path): def check_unicode(txt): if not isinstance(txt, unicode_type): txt = txt.decode(filesystem_encoding, 'replace') txt = txt.replace('_', ' ') return txt mi = cls.metadata_from_formats([path]) if (mi.title==_('Unknown') or mi.authors==[_('Unknown')]) \ and '#' in mi.title: fn = os.path.splitext(os.path.basename(path))[0] match = cls.JETBOOK_FILE_NAME_PATTERN.match(fn) if match is not None: mi.title = check_unicode(match.group('title')) authors = string_to_authors(match.group('authors')) mi.authors = list(map(check_unicode, authors)) return mi
def get_metadata(stream): """ Return metadata as a L{MetaInfo} object """ stream.seek(0) if stream.read(5) != r'{\rtf': return MetaInformation(_('Unknown')) block = get_document_info(stream)[0] if not block: return MetaInformation(_('Unknown')) stream.seek(0) cpg = detect_codepage(stream) stream.seek(0) title_match = title_pat.search(block) if title_match is not None: title = decode(title_match.group(1).strip(), cpg) else: title = _('Unknown') author_match = author_pat.search(block) if author_match is not None: author = decode(author_match.group(1).strip(), cpg) else: author = None mi = MetaInformation(title) if author: mi.authors = string_to_authors(author) comment_match = comment_pat.search(block) if comment_match is not None: comment = decode(comment_match.group(1).strip(), cpg) mi.comments = comment tags_match = tags_pat.search(block) if tags_match is not None: tags = decode(tags_match.group(1).strip(), cpg) mi.tags = list([_f for _f in (x.strip() for x in tags.split(',')) if _f]) publisher_match = publisher_pat.search(block) if publisher_match is not None: publisher = decode(publisher_match.group(1).strip(), cpg) mi.publisher = publisher return mi
def command_add(args, dbpath): from calibre.ebooks.metadata import string_to_authors parser = add_option_parser() opts, args = parser.parse_args(sys.argv[:1] + args) aut = string_to_authors(opts.authors) if opts.authors else [] tags = [x.strip() for x in opts.tags.split(',')] if opts.tags else [] if opts.empty: do_add_empty(get_db(dbpath, opts), opts.title, aut, opts.isbn, tags, opts.series, opts.series_index) return 0 if len(args) < 2: parser.print_help() print print >>sys.stderr, _('You must specify at least one file to add') return 1 do_add(get_db(dbpath, opts), args[1:], opts.one_book_per_directory, opts.recurse, opts.duplicates, opts.title, aut, opts.isbn, tags, opts.series, opts.series_index) return 0
def get_metadata(stream): """ Return metadata as a L{MetaInfo} object """ stream.seek(0) if stream.read(5) != r'{\rtf': return MetaInformation(_('Unknown')) block = get_document_info(stream)[0] if not block: return MetaInformation(_('Unknown')) stream.seek(0) cpg = detect_codepage(stream) stream.seek(0) title_match = title_pat.search(block) if title_match is not None: title = decode(title_match.group(1).strip(), cpg) else: title = _('Unknown') author_match = author_pat.search(block) if author_match is not None: author = decode(author_match.group(1).strip(), cpg) else: author = None mi = MetaInformation(title) if author: mi.authors = string_to_authors(author) comment_match = comment_pat.search(block) if comment_match is not None: comment = decode(comment_match.group(1).strip(), cpg) mi.comments = comment tags_match = tags_pat.search(block) if tags_match is not None: tags = decode(tags_match.group(1).strip(), cpg) mi.tags = list(filter(None, (x.strip() for x in tags.split(',')))) publisher_match = publisher_pat.search(block) if publisher_match is not None: publisher = decode(publisher_match.group(1).strip(), cpg) mi.publisher = publisher return mi
def get_metadata(stream): """ Return metadata as a L{MetaInfo} object """ title = 'Unknown' mi = MetaInformation(title, ['Unknown']) stream.seek(0) try: if not stream.read(14) == MAGIC: print >> sys.stderr, u'Couldn\'t read RB header from file' return mi stream.read(10) read_i32 = lambda: struct.unpack('<I', stream.read(4))[0] stream.seek(read_i32()) toc_count = read_i32() for i in range(toc_count): stream.read(32) length, offset, flag = read_i32(), read_i32(), read_i32() if flag == 2: break else: print >> sys.stderr, u'Couldn\'t find INFO from RB file' return mi stream.seek(offset) info = stream.read(length).splitlines() for line in info: if '=' not in line: continue key, value = line.split('=') if key.strip() == 'TITLE': mi.title = value.strip() elif key.strip() == 'AUTHOR': mi.author = value mi.authors = string_to_authors(value) except Exception as err: msg = u'Couldn\'t read metadata from rb: %s with error %s' % ( mi.title, unicode(err)) print >> sys.stderr, msg.encode('utf8') raise return mi
def get_metadata(stream): """ Return metadata as a L{MetaInfo} object """ title = 'Unknown' mi = MetaInformation(title, ['Unknown']) stream.seek(0) try: if not stream.read(14) == MAGIC: print(u'Couldn\'t read RB header from file', file=sys.stderr) return mi stream.read(10) read_i32 = lambda: struct.unpack('<I', stream.read(4))[0] stream.seek(read_i32()) toc_count = read_i32() for i in range(toc_count): stream.read(32) length, offset, flag = read_i32(), read_i32(), read_i32() if flag == 2: break else: print(u'Couldn\'t find INFO from RB file', file=sys.stderr) return mi stream.seek(offset) info = stream.read(length).splitlines() for line in info: if '=' not in line: continue key, value = line.split('=') if key.strip() == 'TITLE': mi.title = value.strip() elif key.strip() == 'AUTHOR': mi.author = value mi.authors = string_to_authors(value) except Exception as err: msg = u'Couldn\'t read metadata from rb: %s with error %s'%(mi.title, unicode(err)) print(msg.encode('utf8'), file=sys.stderr) raise return mi
def read_doc_props(raw, mi, XPath): root = fromstring(raw) titles = XPath('//dc:title')(root) if titles: title = titles[0].text if title and title.strip(): mi.title = title.strip() tags = [] for subject in XPath('//dc:subject')(root): if subject.text and subject.text.strip(): tags.append(subject.text.strip().replace(',', '_')) for keywords in XPath('//cp:keywords')(root): if keywords.text and keywords.text.strip(): for x in keywords.text.split(): tags.extend(y.strip() for y in x.split(',') if y.strip()) if tags: mi.tags = tags authors = XPath('//dc:creator')(root) aut = [] for author in authors: if author.text and author.text.strip(): aut.extend(string_to_authors(author.text)) if aut: mi.authors = aut mi.author_sort = authors_to_sort_string(aut) desc = XPath('//dc:description')(root) if desc: raw = etree.tostring(desc[0], method='text', encoding=unicode_type) raw = raw.replace('_x000d_', '') # Word 2007 mangles newlines in the summary mi.comments = raw.strip() langs = [] for lang in XPath('//dc:language')(root): if lang.text and lang.text.strip(): l = canonicalize_lang(lang.text) if l: langs.append(l) if langs: mi.languages = langs
def get_metadata_quick(raw): p = podofo.PDFDoc() p.load(raw) title = p.title if not title: title = '_' author = p.author authors = string_to_authors(author) if author else [_('Unknown')] creator = p.creator try: tags = [x.strip() for x in p.keywords.split(u',')] tags = [x for x in tags if x] except: tags = [] mi = MetaInformation(title, authors) if creator: mi.book_producer = creator if tags: mi.tags = tags return mi
def read_doc_props(raw, mi): root = fromstring(raw) titles = XPath('//dc:title')(root) if titles: title = titles[0].text if title and title.strip(): mi.title = title.strip() tags = [] for subject in XPath('//dc:subject')(root): if subject.text and subject.text.strip(): tags.append(subject.text.strip().replace(',', '_')) for keywords in XPath('//cp:keywords')(root): if keywords.text and keywords.text.strip(): for x in keywords.text.split(): tags.extend(y.strip() for y in x.split(',') if y.strip()) if tags: mi.tags = tags authors = XPath('//dc:creator')(root) aut = [] for author in authors: if author.text and author.text.strip(): aut.extend(string_to_authors(author.text)) if aut: mi.authors = aut mi.author_sort = authors_to_sort_string(aut) desc = XPath('//dc:description')(root) if desc: raw = etree.tostring(desc[0], method='text', encoding=unicode) raw = raw.replace('_x000d_', '') # Word 2007 mangles newlines in the summary mi.comments = raw.strip() langs = [] for lang in XPath('//dc:language')(root): if lang.text and lang.text.strip(): l = canonicalize_lang(lang.text) if l: langs.append(l) if langs: mi.languages = langs
def main(opts, args, dbctx): aut = string_to_authors(opts.authors) if opts.authors else [] tags = [x.strip() for x in opts.tags.split(',')] if opts.tags else [] lcodes = [canonicalize_lang(x) for x in (opts.languages or '').split(',')] lcodes = [x for x in lcodes if x] identifiers = (x.partition(':')[::2] for x in opts.identifier) identifiers = dict((k.strip(), v.strip()) for k, v in identifiers if k.strip() and v.strip()) if opts.empty: do_add_empty( dbctx, opts.title, aut, opts.isbn, tags, opts.series, opts.series_index, opts.cover, identifiers, lcodes ) return 0 if len(args) < 1: raise SystemExit(_('You must specify at least one file to add')) do_add( dbctx, args, opts.one_book_per_directory, opts.recurse, opts.duplicates, opts.title, aut, opts.isbn, tags, opts.series, opts.series_index, opts.cover, identifiers, lcodes, opts.filters ) return 0
def opts_to_mi(self, mi): from calibre.ebooks.metadata import string_to_authors for x in self.metadata_option_names: val = getattr(self.opts, x, None) if val is not None: if x == 'authors': val = string_to_authors(val) elif x == 'tags': val = [i.strip() for i in val.split(',')] elif x in ('rating', 'series_index'): try: val = float(val) except ValueError: self.log.warn(_('Values of series index and rating must' ' be numbers. Ignoring'), val) continue elif x in ('timestamp', 'pubdate'): try: val = parse_date(val, assume_utc=x=='pubdate') except: self.log.exception(_('Failed to parse date/time') + ' ' + unicode(val)) continue setattr(mi, x, val)
def metadata_from_filename(name, pat=None, fallback_pat=None): if isbytestring(name): name = name.decode(filesystem_encoding, 'replace') name = name.rpartition('.')[0] mi = MetaInformation(None, None) if pat is None: pat = re.compile(prefs.get('filename_pattern')) name = name.replace('_', ' ') match = pat.search(name) if match is None and fallback_pat is not None: match = fallback_pat.search(name) if match is not None: try: mi.title = match.group('title') except IndexError: pass try: au = match.group('author') aus = string_to_authors(au) if aus: mi.authors = aus if prefs['swap_author_names'] and mi.authors: def swap(a): if ',' in a: parts = a.split(',', 1) else: parts = a.split(None, 1) if len(parts) > 1: t = parts[-1] parts = parts[:-1] parts.insert(0, t) return ' '.join(parts) mi.authors = [swap(x) for x in mi.authors] except (IndexError, ValueError): pass try: mi.series = match.group('series') except IndexError: pass try: si = match.group('series_index') mi.series_index = float(si) except (IndexError, ValueError, TypeError): pass try: si = match.group('isbn') mi.isbn = si except (IndexError, ValueError): pass try: publisher = match.group('publisher') mi.publisher = publisher except (IndexError, ValueError): pass try: pubdate = match.group('published') if pubdate: from calibre.utils.date import parse_only_date mi.pubdate = parse_only_date(pubdate) except: pass try: comments = match.group('comments') mi.comments = comments except (IndexError, ValueError): pass if mi.is_null('title'): mi.title = name return mi
def create_oebbook(self, htmlpath, basedir, opts, log, mi): import uuid from calibre.ebooks.conversion.plumber import create_oebbook from calibre.ebooks.oeb.base import (DirContainer, rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES, xpath, urlquote) from calibre import guess_type from calibre.ebooks.oeb.transforms.metadata import \ meta_info_to_oeb_metadata from calibre.ebooks.html.input import get_filelist from calibre.ebooks.metadata import string_to_authors from calibre.utils.localization import canonicalize_lang import css_parser, logging css_parser.log.setLevel(logging.WARN) self.OEB_STYLES = OEB_STYLES oeb = create_oebbook(log, None, opts, self, encoding=opts.input_encoding, populate=False) self.oeb = oeb metadata = oeb.metadata meta_info_to_oeb_metadata(mi, metadata, log) if not metadata.language: l = canonicalize_lang(getattr(opts, 'language', None)) if not l: oeb.logger.warn('Language not specified') l = get_lang().replace('_', '-') metadata.add('language', l) if not metadata.creator: a = getattr(opts, 'authors', None) if a: a = string_to_authors(a) if not a: oeb.logger.warn('Creator not specified') a = [self.oeb.translate(__('Unknown'))] for aut in a: metadata.add('creator', aut) if not metadata.title: oeb.logger.warn('Title not specified') metadata.add('title', self.oeb.translate(__('Unknown'))) bookid = unicode_type(uuid.uuid4()) metadata.add('identifier', bookid, id='uuid_id', scheme='uuid') for ident in metadata.identifier: if 'id' in ident.attrib: self.oeb.uid = metadata.identifier[0] break filelist = get_filelist(htmlpath, basedir, opts, log) filelist = [f for f in filelist if not f.is_binary] htmlfile_map = {} for f in filelist: path = f.path oeb.container = DirContainer(os.path.dirname(path), log, ignore_opf=True) bname = os.path.basename(path) id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname)) htmlfile_map[path] = href item = oeb.manifest.add(id, href, 'text/html') if path == htmlpath and '%' in path: bname = urlquote(bname) item.html_input_href = bname oeb.spine.add(item, True) self.added_resources = {} self.log = log self.log('Normalizing filename cases') for path, href in htmlfile_map.items(): if not self.is_case_sensitive(path): path = path.lower() self.added_resources[path] = href self.urlnormalize, self.DirContainer = urlnormalize, DirContainer self.urldefrag = urldefrag self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME self.log('Rewriting HTML links') for f in filelist: path = f.path dpath = os.path.dirname(path) oeb.container = DirContainer(dpath, log, ignore_opf=True) href = htmlfile_map[path] try: item = oeb.manifest.hrefs[href] except KeyError: item = oeb.manifest.hrefs[urlnormalize(href)] rewrite_links(item.data, partial(self.resource_adder, base=dpath)) for item in oeb.manifest.values(): if item.media_type in self.OEB_STYLES: dpath = None for path, href in self.added_resources.items(): if href == item.href: dpath = os.path.dirname(path) break css_parser.replaceUrls( item.data, partial(self.resource_adder, base=dpath)) toc = self.oeb.toc self.oeb.auto_generated_toc = True titles = [] headers = [] for item in self.oeb.spine: if not item.linear: continue html = item.data title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) title = re.sub(r'\s+', ' ', title.strip()) if title: titles.append(title) headers.append('(unlabled)') for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): expr = '/h:html/h:body//h:%s[position()=1]/text()' header = ''.join(xpath(html, expr % tag)) header = re.sub(r'\s+', ' ', header.strip()) if header: headers[-1] = header break use = titles if len(titles) > len(set(titles)): use = headers for title, item in zip(use, self.oeb.spine): if not item.linear: continue toc.add(title, item.href) oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True) return oeb
def authors_from_string(self, raw): from calibre.ebooks.metadata import string_to_authors self.authors = string_to_authors(raw)
def auto_generate(self, *args): au = unicode(self.authors_edit.text()) au = re.sub(r'\s+et al\.$', '', au).strip() authors = string_to_authors(au) self.current_val = self.db.author_sort_from_authors(authors)
def fget(self): au = unicode(self.text()).strip() if not au: au = self.get_default() return string_to_authors(au)
def deduce_author_sort(self, *args): au = str(self.author.currentText()) au = re.sub(r'\s+et al\.$', '', au) authors = string_to_authors(au) self.author_sort.setText(self.db.author_sort_from_authors(authors))
def get_metadata_(src, encoding=None): # Meta data definitions as in # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9 if isbytestring(src): if not encoding: src = xml_to_unicode(src)[0] else: src = src.decode(encoding, 'replace') src = src[:150000] # Searching shouldn't take too long comment_tags = parse_comment_tags(src) meta_tags = parse_meta_tags(src) def get(field): ans = comment_tags.get(field, meta_tags.get(field, None)) if ans: ans = ans.strip() if not ans: ans = None return ans # Title title = get('title') if not title: pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE) match = pat.search(src) if match: title = replace_entities(match.group(1)) # Author authors = get('authors') or _('Unknown') # Create MetaInformation with Title and Author mi = Metadata(title or _('Unknown'), string_to_authors(authors)) for field in ('publisher', 'isbn', 'language', 'comments'): val = get(field) if val: setattr(mi, field, val) for field in ('pubdate', 'timestamp'): try: val = parse_date(get(field)) except: pass else: if not is_date_undefined(val): setattr(mi, field, val) # SERIES series = get('series') if series: pat = re.compile(r'\[([.0-9]+)\]$') match = pat.search(series) series_index = None if match is not None: try: series_index = float(match.group(1)) except: pass series = series.replace(match.group(), '').strip() mi.series = series if series_index is None: series_index = get('series_index') try: series_index = float(series_index) except: pass if series_index is not None: mi.series_index = series_index # RATING rating = get('rating') if rating: try: mi.rating = float(rating) if mi.rating < 0: mi.rating = 0 if mi.rating > 5: mi.rating /= 2. if mi.rating > 5: mi.rating = 0 except: pass # TAGS tags = get('tags') if tags: tags = [x.strip() for x in tags.split(',') if x.strip()] if tags: mi.tags = tags return mi