def do_one_isbn_add(self):
    """
    Import one queued ISBN entry as a new book and schedule the next one.

    Each call pops the first entry of ``self.isbn_books``, builds a record
    carrying only its ISBN (plus ``self.isbn_add_tags`` when set), imports it
    together with the entry's file path (if any), bumps the progress dialog
    value and re-arms itself through a 10 ms single-shot timer.

    When the queue is empty the model is told how many books were added, the
    dialog is accepted and a metadata download is requested for every id in
    ``self.add_by_isbn_ids``.

    Any exception accepts (closes) the dialog and is then re-raised.
    """
    try:
        database = self.gui.library_view.model().db

        try:
            entry = self.isbn_books.pop(0)
        except IndexError:
            # Nothing left to import: finish up and fetch the real metadata.
            self.gui.library_view.model().books_added(
                self.isbn_add_dialog.value)
            self.isbn_add_dialog.accept()
            required_fields = frozenset(['title', 'authors'])
            edit_action = self.gui.iactions['Edit Metadata']
            edit_action.download_metadata(
                ids=self.add_by_isbn_ids, ensure_fields=required_fields)
            return

        record = MetaInformation(None)
        record.isbn = entry['isbn']
        if self.isbn_add_tags:
            record.tags = list(self.isbn_add_tags)

        path = entry['path']
        formats = [path] if path is not None else []
        book_id = database.import_book(record, formats)
        self.add_by_isbn_ids.add(book_id)

        self.isbn_add_dialog.value += 1
        QTimer.singleShot(10, self.do_one_isbn_add)
    except:
        # Close the dialog on any failure, then let the error propagate.
        self.isbn_add_dialog.accept()
        raise
def setUp(self):
    """
    Create a temporary library and add three text-format test books to it.

    The same open file object is passed three times as the book "path",
    paired with the metadata records ``m1``, ``m2`` and ``m2`` again;
    ``add_duplicates=True`` is passed so the repeated record is added too.
    The two records are kept on ``self.m1`` / ``self.m2`` for the tests.
    """
    self.tdir = PersistentTemporaryDirectory('_calibre_dbtest')
    self.db = LibraryDatabase2(self.tdir)

    m1 = MetaInformation('Test Ebook 1', ['Test Author 1'])
    m1.tags = ['tag1', 'tag2']
    m1.publisher = 'Test Publisher 1'
    m1.rating = 2
    m1.series = 'Test Series 1'
    m1.series_index = 3
    m1.author_sort = 'as1'
    m1.isbn = 'isbn1'
    m1.cover_data = ('jpg', self.img)

    m2 = MetaInformation('Test Ebook 2', ['Test Author 2'])
    m2.tags = ['tag3', 'tag4']
    m2.publisher = 'Test Publisher 2'
    m2.rating = 3
    m2.series = 'Test Series 2'
    m2.series_index = 1
    m2.author_sort = 'as1'
    m2.isbn = 'isbn1'

    # The file is opened in binary mode, so write bytes, and make sure the
    # handle is released once the books have been added.
    # NOTE(review): assumes add_books does not keep the stream after it
    # returns - confirm against LibraryDatabase2.add_books.
    with open(os.path.join(self.tdir, 'test.txt'), 'w+b') as f:
        f.write(b'test')
        paths = list(repeat(f, 3))
        formats = list(repeat('txt', 3))
        self.db.add_books(paths, formats, [m1, m2, m2], add_duplicates=True)

    self.m1, self.m2 = m1, m2
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object

    Metadata is only read when record 0 is exactly 132 bytes long, the
    header reports compression 2 or 10 and ``has_metadata`` is 1.  The
    metadata record is decoded as cp1252 and split on NUL bytes; items 0, 1,
    3 and 4 become title, author, publisher and ISBN after every character
    outside a small whitelist has been removed.  When no title was found the
    PDB header title (or the translated 'Unknown') is used.
    """
    def scrub(text):
        # Drop every character that is not in the allowed set.
        return re.sub(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]', '', text)

    mi = MetaInformation(None, [_('Unknown')])
    stream.seek(0)

    pheader = PdbHeaderReader(stream)

    # Only Dropbook produced 132 byte record0 files are supported
    if len(pheader.section_data(0)) == 132:
        hr = HeaderRecord(pheader.section_data(0))

        supported = hr.compression in (2, 10) and hr.has_metadata == 1
        if supported:
            try:
                raw = pheader.section_data(hr.metadata_offset)
                parts = raw.decode('cp1252', 'replace').split('\x00')
                # Assigned one by one so a short record still yields the
                # leading fields before the IndexError is swallowed.
                mi.title = scrub(parts[0])
                mi.authors = [scrub(parts[1])]
                mi.publisher = scrub(parts[3])
                mi.isbn = scrub(parts[4])
            except Exception:
                pass

            if extract_cover:
                mi.cover_data = get_cover(pheader, hr)

    if not mi.title:
        mi.title = pheader.title or _('Unknown')

    return mi
def get_metadata(stream, extract_cover=True):
    """
    Read metadata from the ``meta.xml`` member of the zip archive in stream.

    ``meta.xml`` is parsed with a namespace-aware SAX parser feeding an
    ``odfmetaparser`` handler; the handler's ``seenfields`` mapping is then
    copied into a new MetaInformation object.  When the field
    ``opf.metadata`` equals ``'true'`` the additional ``opf.*`` fields are
    applied as well.  Unless ``opf.nocover`` is ``'true'``, ``read_cover`` is
    called; any error it raises is ignored.

    :param stream: file-like object (or path) accepted by zipfile.ZipFile
    :param extract_cover: passed through to ``read_cover``
    :return: the populated MetaInformation object
    """
    zin = zipfile.ZipFile(stream, 'r')
    odfs = odfmetaparser()
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    parser.setContentHandler(odfs)
    content = zin.read('meta.xml')
    parser.parse(StringIO(content))
    data = odfs.seenfields
    mi = MetaInformation(None, [])
    # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
    if 'title' in data:
        mi.title = data['title']
    if data.get('initial-creator', '').strip():
        mi.authors = string_to_authors(data['initial-creator'])
    elif 'creator' in data:
        mi.authors = string_to_authors(data['creator'])
    if 'description' in data:
        mi.comments = data['description']
    if 'language' in data:
        mi.language = data['language']
    if data.get('keywords', ''):
        mi.tags = [x.strip() for x in data['keywords'].split(',') if x.strip()]
    opfmeta = False  # we need this later for the cover
    opfnocover = False
    if data.get('opf.metadata', '') == 'true':
        # custom metadata contains OPF information
        opfmeta = True
        if data.get('opf.titlesort', ''):
            mi.title_sort = data['opf.titlesort']
        if data.get('opf.authors', ''):
            mi.authors = string_to_authors(data['opf.authors'])
        if data.get('opf.authorsort', ''):
            mi.author_sort = data['opf.authorsort']
        if data.get('opf.isbn', ''):
            isbn = check_isbn(data['opf.isbn'])
            if isbn is not None:
                mi.isbn = isbn
        if data.get('opf.publisher', ''):
            mi.publisher = data['opf.publisher']
        if data.get('opf.pubdate', ''):
            mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
        if data.get('opf.series', ''):
            mi.series = data['opf.series']
            if data.get('opf.seriesindex', ''):
                try:
                    mi.series_index = float(data['opf.seriesindex'])
                except ValueError:
                    mi.series_index = 1.0
        if data.get('opf.language', ''):
            cl = canonicalize_lang(data['opf.language'])
            if cl:
                mi.languages = [cl]
        opfnocover = data.get('opf.nocover', 'false') == 'true'
    if not opfnocover:
        try:
            read_cover(stream, zin, mi, opfmeta, extract_cover)
        except Exception:
            # Do not let an error reading the cover prevent reading other
            # data.  ``Exception`` (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            pass

    return mi
def get_metadata_from_reader(rdr):
    """
    Build a MetaInformation object from an already opened reader object.

    The reader's home file is converted to unicode and parsed with
    BeautifulSoup; authors, publisher, ISBN, comments and cover are taken
    from that document via the ``_get_*`` helpers.  The title comes from
    ``rdr.title`` and is converted with the encoding the reader reports,
    falling back to cp1252 when that encoding is unusable.

    :param rdr: reader object providing ``GetFile``, ``home``, ``title`` and
        ``GetEncoding``
    :return: the populated MetaInformation object
    """
    raw = rdr.GetFile(rdr.home)
    home = BeautifulSoup(xml_to_unicode(raw, strip_encoding_pats=True,
                                        resolve_entities=True)[0])

    title = rdr.title
    try:
        x = rdr.GetEncoding()
        # Raises if Python has no codec registered under this name.
        codecs.lookup(x)
        enc = x
    except Exception:
        # Unknown or missing encoding; ``Exception`` rather than a bare
        # except so KeyboardInterrupt / SystemExit are not swallowed.
        enc = 'cp1252'
    title = force_unicode(title, enc)
    authors = _get_authors(home)
    mi = MetaInformation(title, authors)
    publisher = _get_publisher(home)
    if publisher:
        mi.publisher = publisher
    isbn = _get_isbn(home)
    if isbn:
        mi.isbn = isbn
    comments = _get_comments(home)
    if comments:
        mi.comments = comments

    cdata = _get_cover(home, rdr)
    if cdata is not None:
        mi.cover_data = ('jpg', cdata)

    return mi
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object

    Metadata is only read when record 0 is exactly 132 bytes long, the
    header reports compression 2 or 10 and ``has_metadata`` is 1.  The
    metadata record is decoded as cp1252 and split on NUL bytes; items 0, 1,
    3 and 4 become title, author, publisher and ISBN after every character
    outside a small whitelist has been removed.  When no title was found the
    PDB header title (or the translated 'Unknown') is used.
    """
    mi = MetaInformation(None, [_('Unknown')])
    stream.seek(0)

    pheader = PdbHeaderReader(stream)
    disallowed = re.compile(r'[^a-zA-Z0-9 \._=\+\-!\?,\'\"]')

    # Only Dropbook produced 132 byte record0 files are supported
    if len(pheader.section_data(0)) == 132:
        hr = HeaderRecord(pheader.section_data(0))

        if hr.compression in (2, 10) and hr.has_metadata == 1:
            try:
                mdata = pheader.section_data(hr.metadata_offset)

                # Decode before splitting so the text handling below works
                # on text rather than raw bytes.
                mdata = mdata.decode('cp1252', 'replace').split('\x00')
                mi.title = disallowed.sub('', mdata[0])
                mi.authors = [disallowed.sub('', mdata[1])]
                mi.publisher = disallowed.sub('', mdata[3])
                mi.isbn = disallowed.sub('', mdata[4])
            except Exception:
                # Best effort: a malformed metadata record must not abort
                # the read, but KeyboardInterrupt / SystemExit should still
                # propagate (hence no bare except).
                pass

            if extract_cover:
                mi.cover_data = get_cover(pheader, hr)

    if not mi.title:
        mi.title = pheader.title if pheader.title else _('Unknown')

    return mi
def get_metadata(stream, extract_cover=True):
    """
    Read metadata from the ``meta.xml`` member of the zip archive in stream.

    ``meta.xml`` is parsed with a namespace-aware SAX parser feeding an
    ``odfmetaparser`` handler; the handler's ``seenfields`` mapping is then
    copied into a new MetaInformation object.  When the field
    ``opf.metadata`` equals ``'true'`` the additional ``opf.*`` fields are
    applied as well.  Unless ``opf.nocover`` is ``'true'``, ``read_cover`` is
    called; any error it raises is ignored.

    :param stream: file-like object (or path) accepted by zipfile.ZipFile
    :param extract_cover: passed through to ``read_cover``
    :return: the populated MetaInformation object
    """
    zin = zipfile.ZipFile(stream, 'r')
    odfs = odfmetaparser()
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    parser.setContentHandler(odfs)
    parser.parse(StringIO(zin.read('meta.xml')))
    data = odfs.seenfields

    mi = MetaInformation(None, [])
    if 'title' in data:
        mi.title = data['title']
    if data.get('initial-creator', '').strip():
        mi.authors = string_to_authors(data['initial-creator'])
    elif 'creator' in data:
        mi.authors = string_to_authors(data['creator'])
    if 'description' in data:
        mi.comments = data['description']
    if 'language' in data:
        mi.language = data['language']
    keywords = data.get('keywords', '')
    if keywords:
        mi.tags = [x.strip() for x in keywords.split(',') if x.strip()]

    opfmeta = False  # we need this later for the cover
    opfnocover = False
    if data.get('opf.metadata', '') == 'true':
        # custom metadata contains OPF information
        opfmeta = True

        title_sort = data.get('opf.titlesort', '')
        if title_sort:
            mi.title_sort = title_sort
        opf_authors = data.get('opf.authors', '')
        if opf_authors:
            mi.authors = string_to_authors(opf_authors)
        author_sort = data.get('opf.authorsort', '')
        if author_sort:
            mi.author_sort = author_sort
        raw_isbn = data.get('opf.isbn', '')
        if raw_isbn:
            isbn = check_isbn(raw_isbn)
            if isbn is not None:
                mi.isbn = isbn
        publisher = data.get('opf.publisher', '')
        if publisher:
            mi.publisher = publisher
        pubdate = data.get('opf.pubdate', '')
        if pubdate:
            mi.pubdate = parse_date(pubdate, assume_utc=True)
        series = data.get('opf.series', '')
        if series:
            mi.series = series
            # The index is only meaningful together with a series name.
            series_index = data.get('opf.seriesindex', '')
            if series_index:
                try:
                    mi.series_index = float(series_index)
                except ValueError:
                    mi.series_index = 1.0
        language = data.get('opf.language', '')
        if language:
            cl = canonicalize_lang(language)
            if cl:
                mi.languages = [cl]
        opfnocover = data.get('opf.nocover', 'false') == 'true'

    if not opfnocover:
        try:
            read_cover(stream, zin, mi, opfmeta, extract_cover)
        except Exception:
            # Do not let an error reading the cover prevent reading other
            # data.  ``Exception`` (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            pass

    return mi
def do_one_isbn_add(self):
    """
    Process a single entry of the pending ISBN queue.

    Pops the first item of ``self.isbn_books``, imports a record holding its
    ISBN (and ``self.isbn_add_tags`` if any) along with the item's path when
    one is given, increments the dialog value and queues another call of
    this method with a 10 ms single-shot timer.

    With an empty queue it instead reports the number of added books to the
    library model, accepts the dialog and asks the 'Edit Metadata' action to
    download metadata for ``self.add_by_isbn_ids``.

    On any exception the dialog is accepted and the exception re-raised.
    """
    try:
        db = self.gui.library_view.model().db

        queue_empty = False
        try:
            item = self.isbn_books.pop(0)
        except IndexError:
            queue_empty = True

        if queue_empty:
            dialog = self.isbn_add_dialog
            self.gui.library_view.model().books_added(dialog.value)
            dialog.accept()
            self.gui.iactions['Edit Metadata'].download_metadata(
                ids=self.add_by_isbn_ids,
                ensure_fields=frozenset(['title', 'authors']))
            return

        mi = MetaInformation(None)
        mi.isbn = item['isbn']
        if self.isbn_add_tags:
            mi.tags = list(self.isbn_add_tags)

        if item['path'] is None:
            fmts = []
        else:
            fmts = [item['path']]
        new_id = db.import_book(mi, fmts)
        self.add_by_isbn_ids.add(new_id)
        self.isbn_add_dialog.value += 1

        # Yield to the event loop before handling the next entry.
        QTimer.singleShot(10, self.do_one_isbn_add)
    except:
        self.isbn_add_dialog.accept()
        raise
def get_metadata_from_reader(rdr):
    """
    Collect metadata from an opened reader object.

    The reader's home file is decoded and parsed with BeautifulSoup.  The
    title is ``rdr.title`` converted using the reader's encoding (cp1252 if
    that encoding cannot be looked up); authors, publisher, ISBN, comments
    and the cover are extracted from the parsed home page by the ``_get_*``
    helpers.  Empty publisher/ISBN/comments values are not stored.
    """
    markup = xml_to_unicode(rdr.GetFile(rdr.home), strip_encoding_pats=True,
                            resolve_entities=True)[0]
    home = BeautifulSoup(markup)

    raw_title = rdr.title
    try:
        enc = rdr.GetEncoding()
        codecs.lookup(enc)
    except:
        enc = 'cp1252'

    mi = MetaInformation(force_unicode(raw_title, enc), _get_authors(home))

    optional_fields = (
        ('publisher', _get_publisher),
        ('isbn', _get_isbn),
        ('comments', _get_comments),
    )
    for attr, extractor in optional_fields:
        value = extractor(home)
        if value:
            setattr(mi, attr, value)

    cover = _get_cover(home, rdr)
    if cover is not None:
        mi.cover_data = ('jpg', cover)

    return mi
def do_add_empty(db, title, authors, isbn, tags, series, series_index):
    """
    Add a book record without any format files to db.

    ``title`` is applied unless it is None; ``authors``, ``isbn`` and
    ``tags`` are applied when truthy; ``series`` and ``series_index`` are
    applied together when ``series`` is truthy.  After the import,
    ``write_dirtied(db)`` and ``send_message()`` are called.
    """
    from calibre.ebooks.metadata import MetaInformation

    book = MetaInformation(None)
    if title is not None:
        book.title = title

    for attr, value in (('authors', authors), ('isbn', isbn), ('tags', tags)):
        if value:
            setattr(book, attr, value)

    if series:
        book.series = series
        book.series_index = series_index

    db.import_book(book, [])
    write_dirtied(db)
    send_message()
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object

    For a ``.pmlz`` stream the archive is unpacked into a temporary
    directory and the contents of all ``*.pml`` files found there are
    joined; otherwise the stream itself is read.  Every ``\\v ... \\v``
    comment in that data is searched for TITLE, AUTHOR, PUBLISHER,
    COPYRIGHT and ISBN entries.  Values are decoded as cp1252, passed
    through ``prepare_string_for_xml`` and stripped of control characters.
    Several AUTHOR entries accumulate; the other fields keep the last value.

    :param stream: binary file object with a ``name`` attribute
    :param extract_cover: when true, ``get_cover`` is called and its result
        stored in ``mi.cover_data``
    """
    def clean(raw):
        # Decode one matched byte value and remove control characters.
        text = prepare_string_for_xml(raw.strip().decode('cp1252', 'replace'))
        return re.sub('[\x00-\x1f]', '', text)

    def find(pattern, comment):
        # Cleaned value of the first match of pattern, or None.
        m = re.search(pattern, comment)
        return clean(m.group(1)) if m else None

    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)
    base_name = os.path.splitext(os.path.basename(stream.name))[0]

    if stream.name.endswith('.pmlz'):
        with TemporaryDirectory('_unpmlz') as tdir:
            zf = ZipFile(stream)
            zf.extractall(tdir)

            chunks = []
            for p in glob.glob(os.path.join(tdir, '*.pml')):
                # Read-only access is all that is needed here.
                with open(p, 'rb') as p_stream:
                    chunks.append(p_stream.read())
            # One join instead of repeated ``+=`` on an ever-growing bytes.
            pml = b''.join(chunks)
            if extract_cover:
                mi.cover_data = get_cover(base_name, tdir, True)
    else:
        pml = stream.read()
        if extract_cover:
            mi.cover_data = get_cover(
                base_name, os.path.abspath(os.path.dirname(stream.name)))

    for comment in re.findall(br'(?ms)\\v.*?\\v', pml):
        title = find(br'TITLE="(.*?)"', comment)
        if title is not None:
            mi.title = title
        author = find(br'AUTHOR="(.*?)"', comment)
        if author is not None:
            # The first real author replaces the 'Unknown' placeholder.
            if mi.authors == [_('Unknown')]:
                mi.authors = []
            mi.authors.append(author)
        publisher = find(br'PUBLISHER="(.*?)"', comment)
        if publisher is not None:
            mi.publisher = publisher
        rights = find(br'COPYRIGHT="(.*?)"', comment)
        if rights is not None:
            mi.rights = rights
        isbn = find(br'ISBN="(.*?)"', comment)
        if isbn is not None:
            mi.isbn = isbn

    return mi
def get_metadata(stream, extract_cover=True):
    """
    Return metadata as a L{MetaInfo} object

    For a ``.pmlz`` stream the archive is unpacked into a temporary
    directory and the contents of all ``*.pml`` files found there are
    concatenated; otherwise the stream itself is read.  Every
    ``\\v ... \\v`` comment in that data is searched for TITLE, AUTHOR,
    PUBLISHER, COPYRIGHT and ISBN entries.  The data is handled as bytes
    throughout and each value is decoded as cp1252 only once it has been
    matched, so the function does not depend on ``str`` being a byte string.

    :param stream: binary file object with a ``name`` attribute
    :param extract_cover: when true, ``get_cover`` is called and its result
        stored in ``mi.cover_data``
    """
    mi = MetaInformation(_('Unknown'), [_('Unknown')])
    stream.seek(0)

    pml = b''
    if stream.name.endswith('.pmlz'):
        with TemporaryDirectory('_unpmlz') as tdir:
            zf = ZipFile(stream)
            zf.extractall(tdir)

            for p in glob.glob(os.path.join(tdir, '*.pml')):
                # The files are only read, so do not ask for write access.
                with open(p, 'rb') as p_stream:
                    pml += p_stream.read()
            if extract_cover:
                mi.cover_data = get_cover(os.path.splitext(os.path.basename(stream.name))[0], tdir, True)
    else:
        pml = stream.read()
        if extract_cover:
            mi.cover_data = get_cover(os.path.splitext(os.path.basename(stream.name))[0], os.path.abspath(os.path.dirname(stream.name)))

    def decoded(m):
        # Decode a matched value and strip control characters from it.
        text = m.group(1).strip().decode('cp1252', 'replace')
        return re.sub('[\x00-\x1f]', '', prepare_string_for_xml(text))

    # Fields that simply take the (last) matched value.
    simple_fields = (
        (br'TITLE="(.*?)"', 'title'),
        (br'PUBLISHER="(.*?)"', 'publisher'),
        (br'COPYRIGHT="(.*?)"', 'rights'),
        (br'ISBN="(.*?)"', 'isbn'),
    )

    # Bytes patterns: the UNICODE flag is not allowed on them and had no
    # effect on this pattern anyway.
    for comment in re.findall(br'(?ms)\\v.*?\\v', pml):
        for pattern, attr in simple_fields:
            m = re.search(pattern, comment)
            if m:
                setattr(mi, attr, decoded(m))
        m = re.search(br'AUTHOR="(.*?)"', comment)
        if m:
            # The first real author replaces the 'Unknown' placeholder.
            if mi.authors == [_('Unknown')]:
                mi.authors = []
            mi.authors.append(decoded(m))

    return mi
def do_add_empty(dbctx, title, authors, isbn, tags, series, series_index, cover, identifiers, languages):
    """
    Add a book without formats through dbctx and print the new book ids.

    ``title`` is applied unless it is None; every other argument is applied
    only when truthy.  ``identifiers`` are set before ``isbn``.  The record
    is passed through ``read_cover`` and sent with
    ``dbctx.run('add', 'empty', ...)``, which must return an
    ``(ids, duplicates)`` pair.
    """
    mi = MetaInformation(None)
    if title is not None:
        mi.title = title
    if authors:
        mi.authors = authors
    if identifiers:
        mi.set_identifiers(identifiers)
    if isbn:
        mi.isbn = isbn
    if tags:
        mi.tags = tags
    if series:
        mi.series = series
        mi.series_index = series_index
    if cover:
        mi.cover = cover
    if languages:
        mi.languages = languages

    added_ids, _duplicates = dbctx.run('add', 'empty', read_cover(mi))
    id_list = ','.join(map(str, added_ids))
    prints(_('Added book ids: %s') % id_list)
def do_add_empty(
    dbctx, title, authors, isbn, tags, series, series_index, cover, identifiers, languages
):
    """
    Create a format-less book from the given fields and report its id(s).

    Only ``title`` is tested against None; all other fields are skipped when
    falsy.  The order of assignment is authors, identifiers, isbn, tags,
    series (with series_index), cover, languages.  The result of
    ``dbctx.run('add', 'empty', read_cover(mi))`` is unpacked as
    ``(ids, duplicates)`` and the ids are printed comma separated.
    """
    record = MetaInformation(None)
    if title is not None:
        record.title = title
    if authors:
        record.authors = authors
    if identifiers:
        record.set_identifiers(identifiers)

    for field, value in (('isbn', isbn), ('tags', tags)):
        if value:
            setattr(record, field, value)

    if series:
        record.series, record.series_index = series, series_index

    for field, value in (('cover', cover), ('languages', languages)):
        if value:
            setattr(record, field, value)

    outcome = dbctx.run('add', 'empty', read_cover(record))
    ids, _duplicates = outcome
    prints(_('Added book ids: %s') % ','.join(str(book_id) for book_id in ids))
def populate(self, entries, browser, verbose=False, api_key=''):
    """
    Append one MetaInformation object to self for every usable entry.

    For each entry the id URL and title are extracted first; if that fails
    the problem is passed to ``report(verbose)`` and the entry is skipped.
    The id URL (with ``?apikey=<api_key>`` appended when a key is given) is
    then opened with ``browser`` to fetch the detailed entry.  If the fetch
    fails, the remaining fields are read from the original entry instead.

    :param entries: iterable of feed entry elements
    :param browser: object whose ``open(url).read()`` returns the raw feed
    :param verbose: print diagnostics when a detail fetch fails
    :param api_key: optional key appended to the id URL
    """
    for x in entries:
        try:
            id_url = entry_id(x)[0].text
            title = self.get_title(x)
        except Exception:
            report(verbose)
            # Without an id and title no record can be built; carrying on
            # would use undefined (or the previous entry's) values.
            continue
        mi = MetaInformation(title, self.get_authors(x))
        try:
            if api_key != '':
                id_url = id_url + "?apikey=" + api_key
            raw = browser.open(id_url).read()
            feed = etree.fromstring(raw)
            x = entry(feed)[0]
        except Exception as e:
            if verbose:
                print('Failed to get all details for an entry')
                print(e)
        mi.comments = self.get_description(x, verbose)
        mi.tags = self.get_tags(x, verbose)
        mi.isbn = self.get_isbn(x, verbose)
        mi.publisher = self.get_publisher(x, verbose)
        mi.pubdate = self.get_date(x, verbose)
        self.append(mi)
def metadata_from_filename(name, pat=None, fallback_pat=None):
    """
    Extract metadata from a file name using named regular expression groups.

    The extension is removed and underscores are replaced by spaces before
    matching.  ``pat`` defaults to the compiled ``filename_pattern``
    preference; ``fallback_pat`` is tried when ``pat`` does not match.  The
    groups ``title``, ``author``, ``series``, ``series_index``, ``isbn``,
    ``publisher``, ``published`` and ``comments`` are used when the pattern
    defines them.  If no title results, the cleaned-up name becomes the
    title.

    :param name: file name (bytes are decoded with the filesystem encoding)
    :param pat: compiled pattern or None
    :param fallback_pat: compiled pattern or None
    :return: the populated MetaInformation object
    """
    if isbytestring(name):
        name = name.decode(filesystem_encoding, 'replace')
    name = name.rpartition('.')[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get('filename_pattern'))
    name = name.replace('_', ' ')
    match = pat.search(name)
    if match is None and fallback_pat is not None:
        match = fallback_pat.search(name)
    if match is not None:
        # match.group() raises IndexError for a group name the pattern does
        # not define, so every field is read in its own try block.
        try:
            mi.title = match.group('title')
        except IndexError:
            pass
        try:
            au = match.group('author')
            aus = string_to_authors(au)
            if aus:
                mi.authors = aus
                if prefs['swap_author_names'] and mi.authors:
                    def swap(a):
                        # Move the last part of the name to the front.
                        if ',' in a:
                            parts = a.split(',', 1)
                        else:
                            parts = a.split(None, 1)
                        if len(parts) > 1:
                            t = parts[-1]
                            parts = parts[:-1]
                            parts.insert(0, t)
                        return ' '.join(parts)
                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass
        try:
            mi.series = match.group('series')
        except IndexError:
            pass
        try:
            si = match.group('series_index')
            mi.series_index = float(si)
        except (IndexError, ValueError, TypeError):
            pass
        try:
            si = match.group('isbn')
            mi.isbn = si
        except (IndexError, ValueError):
            pass
        try:
            publisher = match.group('publisher')
            mi.publisher = publisher
        except (IndexError, ValueError):
            pass
        try:
            pubdate = match.group('published')
            if pubdate:
                from calibre.utils.date import parse_only_date
                mi.pubdate = parse_only_date(pubdate)
        except Exception:
            # An unparseable date is ignored; ``Exception`` rather than a
            # bare except so KeyboardInterrupt / SystemExit still propagate.
            pass
        try:
            comments = match.group('comments')
            mi.comments = comments
        except (IndexError, ValueError):
            pass
    if mi.is_null('title'):
        mi.title = name
    return mi
def _html_comment_value(src, key, quoted_backref=False):
    """Return the value of ``KEY=<quote>value<quote>`` inside an HTML comment, or None."""
    if quoted_backref:
        # The closing quote must be the same character as the opening one.
        pattern, group = r'<!--.*?%s=(?P<q>[\'"])(.+?)(?P=q).*?-->' % key, 2
    else:
        pattern, group = r'<!--.*?%s=[\'"]([^"\']+)[\'"].*?-->' % key, 1
    match = re.search(pattern, src, re.DOTALL)
    return match.group(group) if match else None


def _html_meta_value(src, names):
    """Return group 1 of the first ``get_meta_regexp_(name)`` that matches src, or None."""
    for name in names:
        match = get_meta_regexp_(name).search(src)
        if match:
            return match.group(1)
    return None


def get_metadata_(src, encoding=None):
    """
    Extract metadata from HTML source.

    Each field is looked up first in an HTML comment of the form
    ``<!-- KEY="value" -->`` and, failing that, with the patterns returned
    by ``get_meta_regexp_`` for a list of names.  The title additionally
    falls back to the ``<title>`` element.  Only the first 150000 characters
    of the source are searched.

    :param src: HTML source; decoded with ``encoding`` (or
        ``xml_to_unicode`` when no encoding is given) if it is not unicode
    :param encoding: optional encoding name used to decode ``src``
    :return: the populated MetaInformation object
    """
    if not isinstance(src, unicode):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long

    def lookup(key, meta_names, quoted_backref=False):
        # Comment value first, then the meta names in order.
        value = _html_comment_value(src, key, quoted_backref)
        if value is None:
            value = _html_meta_value(src, meta_names)
        return value

    # Meta data definitions as in
    # http://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    # Title
    title = lookup('TITLE', ('DC.title', 'DCTERMS.title', 'Title'), True)
    if not title:
        match = re.search('<title>([^<>]+?)</title>', src, re.IGNORECASE)
        if match:
            title = match.group(1)

    # Author
    author = _html_comment_value(src, 'AUTHOR', True)
    if author is not None:
        # Only the comment form has its commas turned into semicolons.
        author = author.replace(',', ';')
    else:
        author = _html_meta_value(
            src, ('Author', 'DC.creator.aut', 'DCTERMS.creator.aut', 'DC.creator'))

    # Create MetaInformation with Title and Author
    ent_pat = re.compile(r'&(\S+)?;')
    if title:
        title = ent_pat.sub(entity_to_unicode, title)
    if author:
        author = ent_pat.sub(entity_to_unicode, author)
    mi = MetaInformation(title, [author] if author else None)

    # Publisher
    publisher = lookup('PUBLISHER', ('Publisher', 'DC.publisher', 'DCTERMS.publisher'), True)
    if publisher:
        mi.publisher = ent_pat.sub(entity_to_unicode, publisher)

    # ISBN
    isbn = lookup('ISBN', ('ISBN', 'DC.identifier.ISBN', 'DCTERMS.identifier.ISBN'))
    if isbn:
        mi.isbn = re.sub(r'[^0-9xX]', '', isbn)

    # LANGUAGE
    language = lookup('LANGUAGE', ('DC.language', 'DCTERMS.language'))
    if language:
        mi.language = language

    # PUBDATE
    pubdate = lookup('PUBDATE', ('Pubdate', 'Date of publication', 'DC.date.published',
                                 'DC.date.publication', 'DC.date.issued', 'DCTERMS.issued'))
    if pubdate:
        try:
            mi.pubdate = parse_date(pubdate)
        except Exception:
            pass  # an unparseable date is ignored

    # TIMESTAMP
    timestamp = lookup('TIMESTAMP', ('Timestamp', 'Date of creation', 'DC.date.created',
                                     'DC.date.creation', 'DCTERMS.created'))
    if timestamp:
        try:
            mi.timestamp = parse_date(timestamp)
        except Exception:
            pass  # an unparseable date is ignored

    # SERIES
    series = lookup('SERIES', ('Series',))
    if series:
        # A "[number]" inside the series name is taken as the series index.
        match = re.search(r'\[([.0-9]+)\]', series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except Exception:
                pass
            series = series.replace(match.group(), '').strip()

        mi.series = ent_pat.sub(entity_to_unicode, series)
        if series_index is None:
            raw_index = _html_meta_value(src, ('Seriesnumber',))
            if raw_index is not None:
                try:
                    series_index = float(raw_index)
                except Exception:
                    pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = lookup('RATING', ('Rating',))
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 5:
                # Values above 5 are halved; anything still above 5 is reset.
                mi.rating /= 2.
            if mi.rating > 5:
                mi.rating = 0
        except Exception:
            pass

    # COMMENTS
    comments = lookup('COMMENTS', ('Comments',))
    if comments:
        mi.comments = ent_pat.sub(entity_to_unicode, comments)

    # TAGS
    tags = lookup('TAGS', ('Tags',))
    if tags:
        mi.tags = [x.strip() for x in ent_pat.sub(entity_to_unicode, tags).split(",")]

    # Ready to return MetaInformation
    return mi
def isbn_api_add(self, isbn):
    '''
    Add an empty book for the given ISBN, then look up and store its metadata.

    The ISBN is reduced to its digits and kept as a string, so leading
    zeros survive; a trailing ``X`` check digit is kept when nine digits
    precede it.

    :param isbn: ISBN-10 or ISBN-13 as text; separators are ignored
    :return: a human readable status message (success or failure)
    :raises cherrypy.HTTPError: 404 when the value contains no digits or
        does not have 10 or 13 characters
    '''
    try:
        digits = re.sub(r'[^0-9]+', '', isbn)
        has_x_check_digit = isbn.strip().upper().endswith('X')
    except (TypeError, AttributeError):
        # Not a string at all
        digits, has_x_check_digit = '', False
    if not digits:
        raise cherrypy.HTTPError(404, 'Invalid isbn %s is not a number: ' % (isbn,))
    if len(digits) == 9 and has_x_check_digit:
        digits += 'X'
    isbn = digits

    isbn_len = len(isbn)
    if isbn_len != 10 and isbn_len != 13:
        raise cherrypy.HTTPError(404, 'Invalid isbn {0} has a wrong length of {1}'.format(isbn, isbn_len))

    mi = MetaInformation(None)
    mi.isbn = isbn

    try:
        new_id = self.db.import_book(mi, [])
    except Exception:
        return 'could not add new book with isbn {0}'.format(isbn)

    # Look up the metadata for the ISBN and store the first result on the
    # new book.  No cover is downloaded here.
    result = ''
    try:
        buf = BytesIO()
        log = create_log(buf)
        abort = Event()
        results = identify(log, abort, title=None, authors=[],
                           identifiers={'isbn': mi.isbn}, timeout=30000)

        if not results:
            # Emit what the identify run logged, then the summary line.
            prints(buf.getvalue(), file=sys.stderr)
            prints('No results found', file=sys.stderr)
            return 'Could not find metadata for isbn {0}'.format(isbn)

        result = results[0]
        self.db.set_metadata(new_id, result)
        result = unicode(result).encode('utf-8')
    except Exception as e:
        return 'Error getting metadata {0}'.format(e)

    return 'Added new book with isbn {0} with new id {1} and metadata {2}'.format(isbn, new_id, result)
# }}}
def get_metadata_(src, encoding=None):
    """
    Extract metadata from HTML source.

    Each field is looked up first in an HTML comment of the form
    ``<!-- KEY="value" -->`` and, failing that, with the patterns returned
    by ``get_meta_regexp_`` for a list of names.  The title additionally
    falls back to the ``<title>`` element.  Only the first 150000 characters
    of the source are searched.

    :param src: HTML source; decoded with ``encoding`` (or
        ``xml_to_unicode`` when no encoding is given) if it is not unicode
    :param encoding: optional encoding name used to decode ``src``
    :return: the populated MetaInformation object
    """
    if not isinstance(src, unicode):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, "replace")

    # Meta data definitions as in
    # http://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    # Title
    title = None
    pat = re.compile(r'<!--.*?TITLE=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
    src = src[:150000]  # Searching shouldn't take too long
    match = pat.search(src)
    if match:
        title = match.group(2)
    else:
        for x in ("DC.title", "DCTERMS.title", "Title"):
            pat = get_meta_regexp_(x)
            match = pat.search(src)
            if match:
                title = match.group(1)
                break
    if not title:
        pat = re.compile("<title>([^<>]+?)</title>", re.IGNORECASE)
        match = pat.search(src)
        if match:
            title = match.group(1)

    # Author
    author = None
    pat = re.compile(r'<!--.*?AUTHOR=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
    match = pat.search(src)
    if match:
        # Only the comment form has its commas turned into semicolons.
        author = match.group(2).replace(",", ";")
    else:
        for x in ("Author", "DC.creator.aut", "DCTERMS.creator.aut", "DC.creator"):
            pat = get_meta_regexp_(x)
            match = pat.search(src)
            if match:
                author = match.group(1)
                break

    # Create MetaInformation with Title and Author
    ent_pat = re.compile(r"&(\S+)?;")
    if title:
        title = ent_pat.sub(entity_to_unicode, title)
    if author:
        author = ent_pat.sub(entity_to_unicode, author)
    mi = MetaInformation(title, [author] if author else None)

    # Publisher
    publisher = None
    pat = re.compile(r'<!--.*?PUBLISHER=(?P<q>[\'"])(.+?)(?P=q).*?-->', re.DOTALL)
    match = pat.search(src)
    if match:
        publisher = match.group(2)
    else:
        for x in ("Publisher", "DC.publisher", "DCTERMS.publisher"):
            pat = get_meta_regexp_(x)
            match = pat.search(src)
            if match:
                publisher = match.group(1)
                break
    if publisher:
        mi.publisher = ent_pat.sub(entity_to_unicode, publisher)

    # ISBN
    isbn = None
    pat = re.compile(r'<!--.*?ISBN=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
    match = pat.search(src)
    if match:
        isbn = match.group(1)
    else:
        for x in ("ISBN", "DC.identifier.ISBN", "DCTERMS.identifier.ISBN"):
            pat = get_meta_regexp_(x)
            match = pat.search(src)
            if match:
                isbn = match.group(1)
                break
    if isbn:
        mi.isbn = re.sub(r"[^0-9xX]", "", isbn)

    # LANGUAGE
    language = None
    pat = re.compile(r'<!--.*?LANGUAGE=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
    match = pat.search(src)
    if match:
        language = match.group(1)
    else:
        for x in ("DC.language", "DCTERMS.language"):
            pat = get_meta_regexp_(x)
            match = pat.search(src)
            if match:
                language = match.group(1)
                break
    if language:
        mi.language = language

    # PUBDATE
    pubdate = None
    pat = re.compile(r'<!--.*?PUBDATE=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
    match = pat.search(src)
    if match:
        pubdate = match.group(1)
    else:
        for x in (
            "Pubdate",
            "Date of publication",
            "DC.date.published",
            "DC.date.publication",
            "DC.date.issued",
            "DCTERMS.issued",
        ):
            pat = get_meta_regexp_(x)
            match = pat.search(src)
            if match:
                pubdate = match.group(1)
                break
    if pubdate:
        try:
            mi.pubdate = parse_date(pubdate)
        except Exception:
            # An unparseable date is ignored; KeyboardInterrupt and
            # SystemExit are no longer swallowed here.
            pass

    # TIMESTAMP
    timestamp = None
    pat = re.compile(r'<!--.*?TIMESTAMP=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
    match = pat.search(src)
    if match:
        timestamp = match.group(1)
    else:
        for x in ("Timestamp", "Date of creation", "DC.date.created", "DC.date.creation", "DCTERMS.created"):
            pat = get_meta_regexp_(x)
            match = pat.search(src)
            if match:
                timestamp = match.group(1)
                break
    if timestamp:
        try:
            mi.timestamp = parse_date(timestamp)
        except Exception:
            pass

    # SERIES
    series = None
    pat = re.compile(r'<!--.*?SERIES=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
    match = pat.search(src)
    if match:
        series = match.group(1)
    else:
        pat = get_meta_regexp_("Series")
        match = pat.search(src)
        if match:
            series = match.group(1)
    if series:
        # A "[number]" inside the series name is taken as the series index.
        pat = re.compile(r"\[([.0-9]+)\]")
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except Exception:
                pass
            series = series.replace(match.group(), "").strip()

        mi.series = ent_pat.sub(entity_to_unicode, series)
        if series_index is None:
            pat = get_meta_regexp_("Seriesnumber")
            match = pat.search(src)
            if match:
                try:
                    series_index = float(match.group(1))
                except Exception:
                    pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = None
    pat = re.compile(r'<!--.*?RATING=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
    match = pat.search(src)
    if match:
        rating = match.group(1)
    else:
        pat = get_meta_regexp_("Rating")
        match = pat.search(src)
        if match:
            rating = match.group(1)
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 5:
                # Values above 5 are halved; anything still above 5 is reset.
                mi.rating /= 2.0
            if mi.rating > 5:
                mi.rating = 0
        except Exception:
            pass

    # COMMENTS
    comments = None
    pat = re.compile(r'<!--.*?COMMENTS=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
    match = pat.search(src)
    if match:
        comments = match.group(1)
    else:
        pat = get_meta_regexp_("Comments")
        match = pat.search(src)
        if match:
            comments = match.group(1)
    if comments:
        mi.comments = ent_pat.sub(entity_to_unicode, comments)

    # TAGS
    tags = None
    pat = re.compile(r'<!--.*?TAGS=[\'"]([^"\']+)[\'"].*?-->', re.DOTALL)
    match = pat.search(src)
    if match:
        tags = match.group(1)
    else:
        pat = get_meta_regexp_("Tags")
        match = pat.search(src)
        if match:
            tags = match.group(1)
    if tags:
        mi.tags = [x.strip() for x in ent_pat.sub(entity_to_unicode, tags).split(",")]

    # Ready to return MetaInformation
    return mi
def get_metadata(stream, extract_cover=True):
    """
    Read metadata from the ``meta.xml`` member of the zip archive in stream.

    Standard fields are located by XPath using the namespace/tag pairs in
    ``fields``; their text is whitespace-normalised.  ``user-defined``
    elements are collected into a dict keyed by lower-cased name, with
    boolean-typed values converted to ``True``/``False``.  When that dict
    has a truthy ``opf.metadata`` entry, the ``opf.*`` entries are applied
    on top.  Unless ``opf.nocover`` is set, ``read_cover`` is called and any
    exception it raises is ignored.
    """
    runs_of_space = re.compile(r'\s+')

    def normalize(text):
        return runs_of_space.sub(' ', text).strip()

    with ZipFile(stream) as zf:
        root = fromstring(zf.read('meta.xml'))

        def find(field):
            ns, tag = fields[field]
            hits = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
            if not hits:
                return None
            text = tostring(hits[0], method='text', encoding='unicode', with_tail=False)
            return normalize(text).strip()

        mi = MetaInformation(None, [])

        title = find('title')
        if title:
            mi.title = title

        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)

        description = find('description')
        if description:
            mi.comments = description

        language = find('language')
        if language:
            canonical = canonicalize_lang(language)
            if canonical:
                mi.languages = [canonical]

        keywords = find('keyword') or find('keywords')
        if keywords:
            mi.tags = [x.strip() for x in keywords.split(',') if x.strip()]

        # Gather the user-defined entries into a plain dict.
        name_attr = '{%s}name' % METANS
        type_attr = '{%s}value-type' % METANS
        data = {}
        for node in root.xpath('//ns0:user-defined', namespaces={'ns0': fields['user-defined'][0]}):
            key = (node.get(name_attr) or '').lower()
            value_type = node.get(type_attr) or 'string'
            value = node.text
            if not (key and value):
                continue
            if value_type == 'boolean':
                value = value == 'true'
            data[key] = value

        opfmeta = bool(data.get('opf.metadata'))  # we need this later for the cover
        opfnocover = False
        if opfmeta:
            # custom metadata contains OPF information
            title_sort = data.get('opf.titlesort', '')
            if title_sort:
                mi.title_sort = title_sort

            opf_authors = data.get('opf.authors', '')
            if opf_authors:
                mi.authors = string_to_authors(opf_authors)

            author_sort = data.get('opf.authorsort', '')
            if author_sort:
                mi.author_sort = author_sort

            raw_isbn = data.get('opf.isbn', '')
            if raw_isbn:
                isbn = check_isbn(raw_isbn)
                if isbn is not None:
                    mi.isbn = isbn

            publisher = data.get('opf.publisher', '')
            if publisher:
                mi.publisher = publisher

            pubdate = data.get('opf.pubdate', '')
            if pubdate:
                mi.pubdate = parse_date(pubdate, assume_utc=True)

            raw_identifiers = data.get('opf.identifiers')
            if raw_identifiers:
                try:
                    mi.identifiers = json.loads(raw_identifiers)
                except Exception:
                    pass

            raw_rating = data.get('opf.rating')
            if raw_rating:
                try:
                    # Clamp to the range 0..10
                    mi.rating = max(0, min(float(raw_rating), 10))
                except Exception:
                    pass

            series = data.get('opf.series', '')
            if series:
                mi.series = series
                raw_index = data.get('opf.seriesindex', '')
                if raw_index:
                    try:
                        mi.series_index = float(raw_index)
                    except Exception:
                        mi.series_index = 1.0

            opf_language = data.get('opf.language', '')
            if opf_language:
                cl = canonicalize_lang(opf_language)
                if cl:
                    mi.languages = [cl]

            opfnocover = data.get('opf.nocover', False)

        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data

    return mi
def metadata_from_filename(name, pat=None, fallback_pat=None):
    """
    Derive metadata from a file name via named regular expression groups.

    The extension is cut off and underscores become spaces before the name
    is searched with ``pat`` (default: the compiled ``filename_pattern``
    preference) and, if that finds nothing, with ``fallback_pat``.  The
    groups ``title``, ``author``, ``series``, ``series_index``, ``isbn``,
    ``publisher`` and ``published`` are applied when the pattern defines
    them.  If the title is still null afterwards, the cleaned-up name is
    used as the title.
    """
    def swap_name_order(author):
        # Split at the first comma if there is one, else at the first
        # whitespace, and move the trailing piece to the front.
        if ',' in author:
            pieces = author.split(',', 1)
        else:
            pieces = author.split(None, 1)
        if len(pieces) > 1:
            pieces = [pieces[-1]] + pieces[:-1]
        return ' '.join(pieces)

    if isbytestring(name):
        name = name.decode(filesystem_encoding, 'replace')
    name = name.rpartition('.')[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get('filename_pattern'))
    name = name.replace('_', ' ')

    found = pat.search(name)
    if found is None and fallback_pat is not None:
        found = fallback_pat.search(name)

    if found is not None:
        # group() raises IndexError for names the pattern does not define.
        try:
            mi.title = found.group('title')
        except IndexError:
            pass

        try:
            authors = string_to_authors(found.group('author'))
            if authors:
                mi.authors = authors
                if prefs['swap_author_names'] and mi.authors:
                    mi.authors = [swap_name_order(a) for a in mi.authors]
        except (IndexError, ValueError):
            pass

        try:
            mi.series = found.group('series')
        except IndexError:
            pass

        try:
            mi.series_index = float(found.group('series_index'))
        except (IndexError, ValueError, TypeError):
            pass

        try:
            mi.isbn = found.group('isbn')
        except (IndexError, ValueError):
            pass

        try:
            mi.publisher = found.group('publisher')
        except (IndexError, ValueError):
            pass

        try:
            published = found.group('published')
            if published:
                from calibre.utils.date import parse_only_date
                mi.pubdate = parse_only_date(published)
        except:
            pass

    if mi.is_null('title'):
        mi.title = name
    return mi