def get_metadata(stream, cpath=None): if not podofo: raise Unavailable(podofo_err) pt = PersistentTemporaryFile('_podofo.pdf') pt.write(stream.read()) pt.close() server = Server(pool_size=1) job = ParallelJob('read_pdf_metadata', 'Read pdf metadata', lambda x,y:x, args=[pt.name, cpath]) server.add_job(job) while not job.is_finished: time.sleep(0.1) job.update() job.update() server.close() if job.result is None: raise ValueError('Failed to read metadata: ' + job.details) title, authors, creator, tags, ok = job.result if not ok: print 'Failed to extract cover:' print job.details if title == '_': title = getattr(stream, 'name', _('Unknown')) title = os.path.splitext(title)[0] mi = MetaInformation(title, authors) if creator: mi.book_producer = creator if tags: mi.tags = tags if os.path.exists(pt.name): os.remove(pt.name) if ok: mi.cover = cpath return mi
def get_metadata(stream, cover=True): with TemporaryDirectory('_pdf_metadata_read') as pdfpath: stream.seek(0) with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f: shutil.copyfileobj(stream, f) try: res = fork_job('calibre.ebooks.metadata.pdf', 'read_info', (pdfpath, bool(cover))) except WorkerError as e: prints(e.orig_tb) raise RuntimeError('Failed to run pdfinfo') info = res['result'] with open(res['stdout_stderr'], 'rb') as f: raw = f.read().strip() if raw: prints(raw) if not info: raise ValueError('Could not read info dict from PDF') covpath = os.path.join(pdfpath, 'cover.jpg') cdata = None if cover and os.path.exists(covpath): with open(covpath, 'rb') as f: cdata = f.read() title = info.get('Title', None) au = info.get('Author', None) if au is None: au = [_('Unknown')] else: au = string_to_authors(au) mi = MetaInformation(title, au) # if isbn is not None: # mi.isbn = isbn creator = info.get('Creator', None) if creator: mi.book_producer = creator keywords = info.get('Keywords', None) mi.tags = [] if keywords: mi.tags = [x.strip() for x in keywords.split(',')] isbn = [check_isbn(x) for x in mi.tags if check_isbn(x)] if isbn: mi.isbn = isbn = isbn[0] mi.tags = [x for x in mi.tags if check_isbn(x) != isbn] subject = info.get('Subject', None) if subject: mi.tags.insert(0, subject) if cdata: mi.cover_data = ('jpeg', cdata) return mi
def get_metadata(stream, cover=True): with TemporaryDirectory("_pdf_metadata_read") as pdfpath: stream.seek(0) with open(os.path.join(pdfpath, "src.pdf"), "wb") as f: shutil.copyfileobj(stream, f) try: res = fork_job("calibre.ebooks.metadata.pdf", "read_info", (pdfpath, bool(cover))) except WorkerError as e: prints(e.orig_tb) raise RuntimeError("Failed to run pdfinfo") info = res["result"] with open(res["stdout_stderr"], "rb") as f: raw = f.read().strip() if raw: prints(raw) if not info: raise ValueError("Could not read info dict from PDF") covpath = os.path.join(pdfpath, "cover.jpg") cdata = None if cover and os.path.exists(covpath): with open(covpath, "rb") as f: cdata = f.read() title = info.get("Title", None) au = info.get("Author", None) if au is None: au = [_("Unknown")] else: au = string_to_authors(au) mi = MetaInformation(title, au) # if isbn is not None: # mi.isbn = isbn creator = info.get("Creator", None) if creator: mi.book_producer = creator keywords = info.get("Keywords", None) mi.tags = [] if keywords: mi.tags = [x.strip() for x in keywords.split(",")] isbn = [check_isbn(x) for x in mi.tags if check_isbn(x)] if isbn: mi.isbn = isbn = isbn[0] mi.tags = [x for x in mi.tags if check_isbn(x) != isbn] subject = info.get("Subject", None) if subject: mi.tags.insert(0, subject) if cdata: mi.cover_data = ("jpeg", cdata) return mi
def get_metadata(stream, cover=True): with TemporaryDirectory('_pdf_metadata_read') as pdfpath: stream.seek(0) with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f: shutil.copyfileobj(stream, f) try: res = fork_job('calibre.ebooks.metadata.pdf', 'read_info', (pdfpath, bool(cover))) except WorkerError as e: prints(e.orig_tb) raise RuntimeError('Failed to run pdfinfo') info = res['result'] with open(res['stdout_stderr'], 'rb') as f: raw = f.read().strip() if raw: prints(raw) if not info: raise ValueError('Could not read info dict from PDF') covpath = os.path.join(pdfpath, 'cover.jpg') cdata = None if cover and os.path.exists(covpath): with open(covpath, 'rb') as f: cdata = f.read() title = info.get('Title', None) au = info.get('Author', None) if au is None: au = [_('Unknown')] else: au = string_to_authors(au) mi = MetaInformation(title, au) #if isbn is not None: # mi.isbn = isbn creator = info.get('Creator', None) if creator: mi.book_producer = creator keywords = info.get('Keywords', None) mi.tags = [] if keywords: mi.tags = [x.strip() for x in keywords.split(',')] subject = info.get('Subject', None) if subject: mi.tags.insert(0, subject) if cdata: mi.cover_data = ('jpeg', cdata) return mi
def get_metadata_quick(raw): p = podofo.PDFDoc() p.load(raw) title = p.title if not title: title = '_' author = p.author authors = string_to_authors(author) if author else [_('Unknown')] creator = p.creator try: tags = [x.strip() for x in p.keywords.split(u',')] tags = [x for x in tags if x] except: tags = [] mi = MetaInformation(title, authors) if creator: mi.book_producer = creator if tags: mi.tags = tags return mi
def get_metadata(stream, cpath=None): if not podofo: raise Unavailable(podofo_err) pt = PersistentTemporaryFile('_podofo.pdf') pt.write(stream.read()) pt.close() server = Server(pool_size=1) job = ParallelJob('read_pdf_metadata', 'Read pdf metadata', lambda x, y: x, args=[pt.name, cpath]) server.add_job(job) while not job.is_finished: time.sleep(0.1) job.update() job.update() server.close() if job.result is None: raise ValueError('Failed to read metadata: ' + job.details) title, authors, creator, tags, ok = job.result if not ok: print 'Failed to extract cover:' print job.details if title == '_': title = getattr(stream, 'name', _('Unknown')) title = os.path.splitext(title)[0] mi = MetaInformation(title, authors) if creator: mi.book_producer = creator if tags: mi.tags = tags if os.path.exists(pt.name): os.remove(pt.name) if ok: mi.cover = cpath return mi
def get_metadata(stream, cover=True): with TemporaryDirectory('_pdf_metadata_read') as pdfpath: stream.seek(0) with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f: shutil.copyfileobj(stream, f) try: res = fork_job('calibre.ebooks.metadata.pdf', 'read_info', (pdfpath, bool(cover))) except WorkerError as e: prints(e.orig_tb) raise RuntimeError('Failed to run pdfinfo') info = res['result'] with open(res['stdout_stderr'], 'rb') as f: raw = f.read().strip() if raw: prints(raw) if info is None: raise ValueError('Could not read info dict from PDF') covpath = os.path.join(pdfpath, 'cover.jpg') cdata = None if cover and os.path.exists(covpath): with open(covpath, 'rb') as f: cdata = f.read() title = info.get('Title', None) or _('Unknown') au = info.get('Author', None) if au is None: au = [_('Unknown')] else: au = string_to_authors(au) mi = MetaInformation(title, au) # if isbn is not None: # mi.isbn = isbn creator = info.get('Creator', None) if creator: mi.book_producer = creator keywords = info.get('Keywords', None) mi.tags = [] if keywords: mi.tags = [x.strip() for x in keywords.split(',')] isbn = [check_isbn(x) for x in mi.tags if check_isbn(x)] if isbn: mi.isbn = isbn = isbn[0] mi.tags = [x for x in mi.tags if check_isbn(x) != isbn] subject = info.get('Subject', None) if subject: mi.tags.insert(0, subject) if 'xmp_metadata' in info: from calibre.ebooks.metadata.xmp import consolidate_metadata mi = consolidate_metadata(mi, info) # Look for recognizable identifiers in the info dict, if they were not # found in the XMP metadata for scheme, check_func in iteritems({ 'doi': check_doi, 'isbn': check_isbn }): if scheme not in mi.get_identifiers(): for k, v in iteritems(info): if k != 'xmp_metadata': val = check_func(v) if val: mi.set_identifier(scheme, val) break if cdata: mi.cover_data = ('jpeg', cdata) return mi
def get_metadata(stream, cover=True): with TemporaryDirectory('_pdf_metadata_read') as pdfpath: stream.seek(0) with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f: shutil.copyfileobj(stream, f) try: res = fork_job('calibre.ebooks.metadata.pdf', 'read_info', (pdfpath, bool(cover))) except WorkerError as e: prints(e.orig_tb) raise RuntimeError('Failed to run pdfinfo') info = res['result'] with open(res['stdout_stderr'], 'rb') as f: raw = f.read().strip() if raw: prints(raw) if not info: raise ValueError('Could not read info dict from PDF') covpath = os.path.join(pdfpath, 'cover.jpg') cdata = None if cover and os.path.exists(covpath): with open(covpath, 'rb') as f: cdata = f.read() title = info.get('Title', None) au = info.get('Author', None) if au is None: au = [_('Unknown')] else: au = string_to_authors(au) mi = MetaInformation(title, au) # if isbn is not None: # mi.isbn = isbn creator = info.get('Creator', None) if creator: mi.book_producer = creator keywords = info.get('Keywords', None) mi.tags = [] if keywords: mi.tags = [x.strip() for x in keywords.split(',')] isbn = [check_isbn(x) for x in mi.tags if check_isbn(x)] if isbn: mi.isbn = isbn = isbn[0] mi.tags = [x for x in mi.tags if check_isbn(x) != isbn] subject = info.get('Subject', None) if subject: mi.tags.insert(0, subject) if 'xmp_metadata' in info: from calibre.ebooks.metadata.xmp import consolidate_metadata mi = consolidate_metadata(mi, info) # Look for recognizable identifiers in the info dict, if they were not # found in the XMP metadata for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems(): if scheme not in mi.get_identifiers(): for k, v in info.iteritems(): if k != 'xmp_metadata': val = check_func(v) if val: mi.set_identifier(scheme, val) break if cdata: mi.cover_data = ('jpeg', cdata) return mi