def extract_cover_thumbs(kindlepath):
    """Create missing cover thumbnails for the books listed in the catalog.

    Book locations and thumbnail paths are read from the SQLite database
    named by the module-level ``baza``; the table and column names come from
    the module-level ``tablica``, ``polozenie``, ``miniatura`` and
    ``typebooka``. Every non-dictionary book location is first handed to
    ``generate_apnx_files``. Then, for each book whose file exists, has a
    supported extension and whose thumbnail file does not exist yet, the
    cover is extracted and saved at the thumbnail path.

    Books that cannot be processed (KFX samples, unreadable metadata, files
    without the ``BOOKMOBI`` signature, missing or unreadable cover images)
    are skipped silently.

    :param kindlepath: not used by this implementation; kept so existing
        callers keep working.
    :return: always ``0``.
    """
    extensions = ('.azw', '.azw3', '.mobi', '.pobi', '.kfx', '.azw8')
    kfx_extensions = ('.kfx', '.azw8')

    # Identifiers cannot be bound as SQL parameters, so they are formatted
    # in; string *values* use single quotes, the standard SQL literal syntax.
    thumbs_query = (
        "SELECT {cn}, {coi} FROM {tn} "
        "WHERE {coi} IS NOT NULL AND {coi} != '0'"
    ).format(coi=miniatura, tn=tablica, cn=polozenie)
    books_query = (
        "SELECT {cn} FROM {tn} "
        "WHERE {cn} IS NOT NULL AND {cn} != '0' "
        "AND {te} != 'Entry:Item:Dictionary'"
    ).format(coi=miniatura, tn=tablica, cn=polozenie, te=typebooka)

    conn = sqlite3.connect(baza)
    try:
        c = conn.cursor()
        c.execute(thumbs_query)
        files = c.fetchall()
        c.execute(books_query)
        ebooki = c.fetchall()
    finally:
        # Release the database even when one of the queries fails.
        conn.close()

    generate_apnx_files(ebooki)

    for name, thumbpath in files:
        encoded_path = name.encode('UTF-8')
        if not os.path.isfile(encoded_path):
            continue
        lowered = name.lower()
        if not lowered.endswith(extensions):
            continue
        if os.path.isfile(thumbpath):
            # Thumbnail already present; nothing to do.
            continue

        is_kfx = lowered.endswith(kfx_extensions)
        if is_kfx:
            if '_sample' in name:
                continue
            try:
                kfx_metadata = get_kindle_kfx_metadata(name)
            except Exception:
                # Best effort: a book with unreadable metadata is skipped.
                continue
            image_data = kfx_metadata.get("cover_image_data")
            if not image_data:
                continue
        else:
            # Only the 8 signature bytes at offset 60 are needed, so avoid
            # loading the whole book into memory.
            with open(encoded_path, 'rb') as mf:
                mf.seek(60)
                magic = mf.read(8)
            if magic != b'BOOKMOBI':
                continue
            section = kindle_unpack.Sectionizer(encoded_path)
            mh = kindle_unpack.MobiHeader(section, 0)
            metadata = mh.getmetadata()

        try:
            if is_kfx:
                cover = process_image(image_data.decode('base64'))
            else:
                cover = get_cover_image(section, mh, metadata)
        except IOError:
            continue
        if cover:
            cover.save(thumbpath)
    return 0
def write_apnx(self, mobi_file_path, apnx_path, page_count=0):
    """Write an APNX page-mapping file for a MOBI book.

    :param mobi_file_path: path of the book to read.
    :param apnx_path: path of the APNX file to create.
    :param page_count: when truthy, the page positions are obtained from
        ``get_pages_exact`` with this count; otherwise ``get_pages_fast``
        is used.
    :return: ``1`` when the book cannot be opened/read or does not carry
        the ``BOOKMOBI`` identity. ``None`` otherwise, including the case
        where more than 65536 pages were produced and nothing is written.
    """
    import uuid

    apnx_meta = {
        'guid': str(uuid.uuid4()).replace('-', '')[:8],
        'asin': '',
        'cdetype': 'EBOK',
        'format': 'MOBI_7',
        'acr': '',
    }

    try:
        with open(mobi_file_path, 'rb') as mf:
            ident = PdbHeaderReader(mf).identity()
            if ident != 'BOOKMOBI':
                return 1
            apnx_meta['acr'] = str(PdbHeaderReader(mf).name())
    except Exception:
        # Unreadable book: report failure, but let KeyboardInterrupt and
        # SystemExit propagate instead of swallowing them.
        return 1

    # Sectionizer works from the path itself, so no extra handle is opened.
    section = kindle_unpack.Sectionizer(mobi_file_path)
    mh = kindle_unpack.MobiHeader(section, 0)
    metadata = mh.getmetadata()

    apnx_meta['format'] = 'MOBI_8' if mh.version == 8 else 'MOBI_7'

    try:
        doc_type = metadata['Document Type'][0]
    except KeyError:
        doc_type = None
    apnx_meta['cdetype'] = 'EBOK' if doc_type is None else doc_type

    try:
        asin = metadata['ASIN'][0]
    except KeyError:
        asin = None
    apnx_meta['asin'] = '' if asin is None else asin

    if page_count:
        pages = self.get_pages_exact(mobi_file_path, page_count)
    else:
        pages = self.get_pages_fast(mobi_file_path)
    if not pages:
        # The selected parser produced nothing; try the fast parser.
        pages = self.get_pages_fast(mobi_file_path)

    if len(pages) > 65536:
        return

    apnx = self.generate_apnx(pages, apnx_meta)
    with open(apnx_path, 'wb') as apnxf:
        apnxf.write(apnx)
def write_apnx(self, mobi_file_path, apnx_path, page_count=0):
    """Write an APNX page-mapping file for a MOBI book.

    If you want a fixed number of pages (such as from a custom column) then
    pass in a value to page_count, otherwise the page positions are
    estimated with the fast algorithm.

    :param mobi_file_path: path of the book to read.
    :param apnx_path: path of the APNX file to create.
    :param page_count: fixed page count, or a falsy value to estimate.
    :return: ``1`` when the book cannot be opened/read or does not carry
        the ``BOOKMOBI`` identity. ``None`` otherwise, including the case
        where more than 65536 pages were produced and nothing is written.
    """
    import uuid

    apnx_meta = {
        'guid': str(uuid.uuid4()).replace('-', '')[:8],
        'asin': '',
        'cdetype': 'EBOK',
        'format': 'MOBI_7',
        'acr': '',
    }

    try:
        with open(mobi_file_path, 'rb') as mf:
            ident = PdbHeaderReader(mf).identity()
            if ident != 'BOOKMOBI':
                # Check that this is really a MOBI file.
                print('BŁĄD! Niepoprawny plik MOBI "%s"'
                      % os.path.basename(mobi_file_path))
                return 1
            apnx_meta['acr'] = str(PdbHeaderReader(mf).name())
    except Exception:
        # Unreadable book: report failure, but let KeyboardInterrupt and
        # SystemExit propagate instead of swallowing them.
        print('Błąd! Nie można otworzyć pliku %s' % mobi_file_path)
        return 1

    # Sectionizer works from the path itself, so no extra handle is opened.
    section = kindle_unpack.Sectionizer(mobi_file_path)
    mh = kindle_unpack.MobiHeader(section, 0)
    metadata = mh.getmetadata()

    apnx_meta['format'] = 'MOBI_8' if mh.version == 8 else 'MOBI_7'

    try:
        doc_type = metadata['Document Type'][0]
    except KeyError:
        doc_type = None
    apnx_meta['cdetype'] = 'EBOK' if doc_type is None else doc_type

    try:
        asin = metadata['ASIN'][0]
    except KeyError:
        asin = None
    apnx_meta['asin'] = '' if asin is None else asin

    if page_count:
        pages = self.get_pages_exact(mobi_file_path, page_count)
    else:
        pages = self.get_pages_fast(mobi_file_path)
    if not pages:
        # The selected parser produced nothing; try the fast parser.
        pages = self.get_pages_fast(mobi_file_path)
    if not pages:
        print('Nie można wygenerować mapowania stron.')

    if len(pages) > 65536:
        print('Ilośc stron w pliku in "%s" przekracza limit. '
              'Nie można zapisać pliku apnx...' % mobi_file_path)
        return

    apnx = self.generate_apnx(pages, apnx_meta)

    # The \\?\ prefix lifts the Windows path-length limit but is only valid
    # on fully-qualified, backslash-separated paths, so normalise first and
    # never apply the prefix twice.
    if sys.platform == 'win32' and not apnx_path.startswith('\\\\?\\'):
        apnx_path = '\\\\?\\' + os.path.abspath(apnx_path)
    with open(apnx_path, 'wb') as apnxf:
        apnxf.write(apnx)
def extract_cover_thumbs(is_silent, is_overwrite_pdoc_thumbs,
                         is_overwrite_amzn_thumbs, is_overwrite_apnx,
                         skip_apnx, kindlepath, is_azw, days, fix_thumb):
    """Extract cover thumbnails for the books in a Kindle's documents dir.

    For every MOBI-family file directly inside ``<kindlepath>/documents``
    the ASIN and document type are read from the book metadata and the
    cover is saved as
    ``<kindlepath>/system/thumbnails/thumbnail_<asin>_<doctype>_portrait.jpg``.
    Afterwards real page numbers are fetched, APNX files are generated
    (unless ``skip_apnx``) and, when ``is_overwrite_pdoc_thumbs`` is set,
    the non-portrait thumbnails are passed to ``fix_generated_thumbs``.

    The page-count CSV is copied to a temporary directory for the duration
    of the run and copied back to the device at the end.

    :param is_silent: suppress the per-book progress output.
    :param is_overwrite_pdoc_thumbs: re-extract covers of ``PDOC`` books
        even when a thumbnail already exists.
    :param is_overwrite_amzn_thumbs: same, for ``EBOK`` and ``EBSP`` books.
    :param is_overwrite_apnx: forwarded to ``generate_apnx_files``.
    :param skip_apnx: do not generate APNX files.
    :param kindlepath: root directory of the mounted Kindle.
    :param is_azw: also process ``.azw`` files.
    :param days: ``None`` to process everything, otherwise a string holding
        the maximum file age in days.
    :param fix_thumb: forwarded to the cover-processing helpers.
    :return: ``1`` when ``kindlepath`` does not look like a Kindle,
        ``0`` otherwise.
    """
    docs = os.path.join(kindlepath, 'documents')
    thumb_dir = os.path.join(kindlepath, 'system', 'thumbnails')
    is_verbose = not is_silent

    try:
        dir_list = sorted(os.listdir(docs))
    except OSError:
        print('* ERROR! No Kindle device found in a specified directory: ' +
              kindlepath)
        return 1
    # Validate the device layout before creating the temporary directory so
    # that a failed run leaves nothing behind.
    if not os.path.isdir(thumb_dir):
        print('* ERROR! No Kindle device found in the specified path: "' +
              os.path.join(kindlepath) + '"')
        return 1

    diff = 0
    if days is not None:
        dtt = datetime.today()
        days_int = int(days)
        print('Notice! Processing files not older than ' + days + ' days.')
    else:
        days_int = 0

    if is_azw:
        extensions = ('.azw', '.azw3', '.mobi')
    else:
        extensions = ('.azw3', '.mobi')

    # move CSV file to computer temp dir to speed up updating process
    csv_pages_name = 'extract_cover_thumbs-book-pages.csv'
    device_csv = os.path.join(docs, csv_pages_name)
    tempdir = tempfile.mkdtemp(suffix='', prefix='extract_cover_thumbs-tmp-')
    csv_pages = os.path.join(tempdir, csv_pages_name)
    try:
        if os.path.isfile(device_csv):
            shutil.copy2(device_csv, csv_pages)
        # load ASIN list from CSV
        asinlist = asin_list_from_csv(csv_pages)

        print("START of extracting cover thumbnails...")
        for f in dir_list:
            if days is not None:
                try:
                    ctime = os.path.getctime(os.path.join(docs, f))
                except OSError:
                    # Entry vanished or is unreadable; skip it.
                    continue
                # Age is counted from midnight of the creation day.
                created = datetime.fromtimestamp(ctime).replace(
                    hour=0, minute=0, second=0, microsecond=0)
                diff = (dtt - created).days
            if not f.lower().endswith(extensions) or diff > days_int:
                continue

            fide = f.decode(sys.getfilesystemencoding())
            mobi_path = os.path.join(docs, f)
            dump_pages(asinlist, csv_pages, docs, f)
            if is_verbose:
                try:
                    print('* %s:' % fide, end=' ')
                except UnicodeError:
                    # Name not representable on this console.
                    print('* %r:' % fide, end=' ')

            # Only the 8 signature bytes at offset 60 are needed.
            with open(mobi_path, 'rb') as mf:
                mf.seek(60)
                magic = mf.read(8)
            if magic != b'BOOKMOBI':
                print('* Not a valid MOBI file "%s".' % fide)
                continue

            section = kindle_unpack.Sectionizer(mobi_path)
            mh = kindle_unpack.MobiHeader(section, 0)
            metadata = mh.getmetadata()
            try:
                asin = metadata['ASIN'][0]
            except KeyError:
                asin = None
            try:
                doctype = metadata['Document Type'][0]
            except KeyError:
                doctype = None
            if asin is None:
                print('ERROR! No ASIN found in "%s"' % fide)
                continue

            thumbpath = os.path.join(
                thumb_dir,
                'thumbnail_%s_%s_portrait.jpg' % (asin, doctype)
            )
            needs_cover = (
                not os.path.isfile(thumbpath) or
                (is_overwrite_pdoc_thumbs and doctype == 'PDOC') or
                (is_overwrite_amzn_thumbs and doctype in ('EBOK', 'EBSP'))
            )
            if not needs_cover:
                if is_verbose:
                    print('skipped (cover present or overwriting not '
                          'forced).')
                continue

            if is_verbose:
                print('EXTRACTING COVER:', end=' ')
            try:
                cover = get_cover_image(section, mh, metadata, doctype, f,
                                        fide, is_verbose, fix_thumb)
            except IOError:
                print('FAILED! Image format unrecognized...')
                continue
            if cover:
                cover.save(thumbpath)

        print("START of downloading real book page numbers...")
        get_real_pages(csv_pages)
        print("FINISH of downloading real book page numbers...")

        if not skip_apnx:
            print("START of generating book page numbers (APNX files)...")
            generate_apnx_files(dir_list, docs, is_verbose,
                                is_overwrite_apnx, days, tempdir)
            print("FINISH of generating book page numbers (APNX files)...")

        if is_overwrite_pdoc_thumbs:
            for thumb_name in os.listdir(thumb_dir):
                if (thumb_name.startswith('thumbnail') and
                        thumb_name.endswith('.jpg') and
                        not thumb_name.endswith('portrait.jpg')):
                    fix_generated_thumbs(
                        os.path.join(thumb_dir, thumb_name),
                        is_verbose, fix_thumb)

        print("FINISH of extracting cover thumbnails...")
        # Nothing to copy back when no CSV was ever produced.
        if os.path.isfile(csv_pages):
            shutil.copy2(csv_pages, device_csv)
    finally:
        # Remove the temporary directory even when processing failed.
        clean_temp(tempdir)
    return 0
def write_apnx(self, mobi_file_path, apnx_path, page_count=0):
    """Write an APNX page-mapping file for a MOBI book.

    If you want a fixed number of pages (such as from a custom column) then
    pass in a value to page_count, otherwise the page positions are
    estimated with the fast algorithm.

    :param mobi_file_path: path of the book to read.
    :param apnx_path: path of the APNX file to create.
    :param page_count: fixed page count, or a falsy value to estimate.
    :return: ``1`` when the book cannot be opened/read or does not carry
        the ``BOOKMOBI`` identity. ``None`` otherwise, including the case
        where more than 65536 pages were produced and nothing is written.
    """
    import uuid

    apnx_meta = {
        'guid': str(uuid.uuid4()).replace('-', '')[:8],
        'asin': '',
        'cdetype': 'EBOK',
        'format': 'MOBI_7',
        'acr': '',
    }

    try:
        with open(mobi_file_path, 'rb') as mf:
            ident = PdbHeaderReader(mf).identity()
            if ident != 'BOOKMOBI':
                # Check that this is really a MOBI file.
                print('ERROR! Not a valid MOBI file "%s"'
                      % os.path.basename(mobi_file_path))
                return 1
            apnx_meta['acr'] = str(PdbHeaderReader(mf).name())
    except Exception:
        # Unreadable book: report failure, but let KeyboardInterrupt and
        # SystemExit propagate instead of swallowing them.
        print('Error! Unable to open file %s' % mobi_file_path)
        return 1

    # We'll need the PDB name, the MOBI version, and some metadata to make
    # FW 3.4 happy with KF8 files...
    # Sectionizer works from the path itself, so no extra handle is opened.
    section = kindle_unpack.Sectionizer(mobi_file_path)
    mh = kindle_unpack.MobiHeader(section, 0)
    metadata = mh.getmetadata()

    apnx_meta['format'] = 'MOBI_8' if mh.version == 8 else 'MOBI_7'

    try:
        doc_type = metadata['Document Type'][0]
    except KeyError:
        doc_type = None
    apnx_meta['cdetype'] = 'EBOK' if doc_type is None else doc_type

    try:
        asin = metadata['ASIN'][0]
    except KeyError:
        asin = None
    apnx_meta['asin'] = '' if asin is None else asin

    # Get the pages depending on the chosen parser
    if page_count:
        pages = self.get_pages_exact(mobi_file_path, page_count)
    else:
        pages = self.get_pages_fast(mobi_file_path)
    if not pages:
        # The selected parser produced nothing; try the fast parser.
        pages = self.get_pages_fast(mobi_file_path)
    if not pages:
        print('Could not generate page mapping.')

    if len(pages) > 65536:
        print('Pages over limit in "%s" file. '
              'Unable to write apnx file...' % mobi_file_path)
        return

    # Generate the APNX file from the page mapping.
    apnx = self.generate_apnx(pages, apnx_meta)

    # Write the APNX.
    # The \\?\ prefix lifts the Windows path-length limit but is only valid
    # on fully-qualified, backslash-separated paths, so normalise first and
    # never apply the prefix twice.
    if sys.platform == 'win32' and not apnx_path.startswith('\\\\?\\'):
        apnx_path = '\\\\?\\' + os.path.abspath(apnx_path)
    with open(apnx_path, 'wb') as apnxf:
        apnxf.write(apnx)
def extract_cover_thumbs(is_silent, is_overwrite_pdoc_thumbs,
                         is_overwrite_amzn_thumbs, is_overwrite_apnx,
                         skip_apnx, kindlepath, is_azw, days, fix_thumb,
                         lubimy_czytac, mark_real_pages, patch_azw3):
    """Extract cover thumbnails for every book under a Kindle's documents.

    ``<kindlepath>/documents`` is walked recursively (the ``dictionaries``
    sub-tree and paths containing ``attachables`` are skipped). For each
    MOBI-family or KFX book the ASIN and document type are read and the
    cover is saved as
    ``<kindlepath>/system/thumbnails/thumbnail_<asin>_<doctype>_portrait.jpg``.
    Afterwards real page numbers are optionally fetched, APNX files are
    generated (unless ``skip_apnx``) and, when ``is_overwrite_pdoc_thumbs``
    is set, the non-portrait thumbnails are passed to
    ``fix_generated_thumbs``.

    The page-count CSV is copied to a temporary directory for the duration
    of the run and copied back to the device at the end.

    :param is_silent: suppress the per-book progress output.
    :param is_overwrite_pdoc_thumbs: re-extract covers of ``PDOC`` books
        even when a thumbnail already exists.
    :param is_overwrite_amzn_thumbs: same, for ``EBOK`` and ``EBSP`` books.
    :param is_overwrite_apnx: forwarded to ``generate_apnx_files``.
    :param skip_apnx: do not generate APNX files.
    :param kindlepath: root directory of the mounted Kindle.
    :param is_azw: also process ``.azw`` files.
    :param days: ``None`` to process everything, otherwise a string holding
        the maximum file age in days.
    :param fix_thumb: forwarded to the cover-processing helpers.
    :param lubimy_czytac: together with a truthy ``days``, enables the
        download of real page numbers via ``get_real_pages``.
    :param mark_real_pages: forwarded to ``get_real_pages``.
    :param patch_azw3: when ``True``, ``.azw3`` books of type ``PDOC`` that
        have an ASIN are rewritten with ``DualMobiMetaFix`` and then
        treated as ``EBOK``.
    :return: ``1`` when ``kindlepath`` does not look like a Kindle,
        ``0`` otherwise.
    """
    docs = os.path.join(kindlepath, 'documents')
    thumb_dir = os.path.join(kindlepath, 'system', 'thumbnails')
    is_verbose = not is_silent

    # Validate the device layout before creating the temporary directory so
    # that a failed run leaves nothing behind.
    if not os.path.isdir(thumb_dir):
        print('* ERROR! No Kindle device found in the specified path: "' +
              os.path.join(kindlepath) + '"')
        return 1

    diff = 0
    if days is not None:
        dtt = datetime.today()
        days_int = int(days)
        print('Notice! Processing files not older than ' + days + ' days.')
    else:
        days_int = 0

    if is_azw:
        extensions = ('.azw', '.azw3', '.mobi', '.kfx', '.azw8')
    else:
        extensions = ('.azw3', '.mobi', '.kfx', '.azw8')
    kfx_extensions = ('.kfx', '.azw8')
    dictionaries_marker = 'documents' + os.path.sep + 'dictionaries'

    # move CSV file to computer temp dir to speed up updating process
    csv_pages_name = 'extract_cover_thumbs_book_pages2.csv'
    device_csv = os.path.join(docs, csv_pages_name)
    tempdir = tempfile.mkdtemp(suffix='', prefix='extract_cover_thumbs-tmp-')
    csv_pages = os.path.join(tempdir, csv_pages_name)
    try:
        if os.path.isfile(device_csv):
            shutil.copy2(device_csv, csv_pages)
        # load ASIN list from CSV
        asinlist, filelist = asin_list_from_csv(csv_pages)

        print("START of extracting cover thumbnails...")
        for root, _dirs, files in os.walk(docs):
            for name in files:
                if dictionaries_marker in root:
                    if is_verbose:
                        print('! Excluded dictionary:', name)
                    continue
                if days is not None:
                    try:
                        ctime = os.path.getctime(os.path.join(root, name))
                    except OSError:
                        continue
                    # Age is counted from midnight of the creation day.
                    created = datetime.fromtimestamp(ctime).replace(
                        hour=0, minute=0, second=0, microsecond=0)
                    diff = (dtt - created).days
                lowered = name.lower()
                if not lowered.endswith(extensions) or diff > days_int:
                    continue

                is_kfx = lowered.endswith(kfx_extensions)
                fide = name.decode(sys.getfilesystemencoding())
                if is_verbose:
                    try:
                        print('* %s:' % fide, end=' ')
                    except UnicodeError:
                        # Name not representable on this console.
                        print('* %r:' % fide, end=' ')
                mobi_path = os.path.join(root, name)
                if "attachables" in mobi_path:
                    continue

                if is_kfx:
                    if '_sample' in fide:
                        if is_verbose:
                            print('KFX Sample. Skipping...')
                        continue
                    try:
                        kfx_metadata = get_kindle_kfx_metadata(mobi_path)
                    except Exception as e:
                        print('ERROR! Extracting metadata from %s: %s' % (
                            fide, unicode(e)
                        ))
                        continue
                    doctype = kfx_metadata.get("cde_content_type")
                    if not doctype:
                        print('ERROR! No document type found in "%s"'
                              % fide)
                        continue
                    asin = kfx_metadata.get("ASIN")
                else:
                    if '!DeviceUpgradeLetter!' in fide:
                        if is_verbose:
                            print('Upgrade Letter. Skipping...')
                        continue
                    dump_pages(asinlist, filelist, csv_pages, root, name,
                               is_verbose)
                    # Only the 8 signature bytes at offset 60 are needed.
                    with open(mobi_path, 'rb') as mf:
                        mf.seek(60)
                        magic = mf.read(8)
                    if magic != b'BOOKMOBI':
                        print('* Not a valid MOBI file "%s".' % fide)
                        continue
                    section = kindle_unpack.Sectionizer(mobi_path)
                    mh = kindle_unpack.MobiHeader(section, 0)
                    metadata = mh.getmetadata()
                    try:
                        asin = metadata['ASIN'][0]
                    except KeyError:
                        asin = None
                    try:
                        doctype = metadata['Document Type'][0]
                    except KeyError:
                        doctype = None
                    if (patch_azw3 is True and doctype == 'PDOC' and
                            asin is not None and
                            lowered.endswith('.azw3')):
                        print("PATCHING AZW3", end=' ')
                        # Build the patched bytes BEFORE opening the book
                        # for writing: opening with 'wb' truncates it, so a
                        # failure while patching must not destroy the book.
                        patched = DualMobiMetaFix(mobi_path).getresult()
                        with open(mobi_path, 'wb') as patched_file:
                            patched_file.write(patched)
                        doctype = 'EBOK'

                if asin is None:
                    print('ERROR! No ASIN found in "%s"' % fide)
                    continue

                thumbpath = os.path.join(
                    thumb_dir,
                    'thumbnail_%s_%s_portrait.jpg' % (asin, doctype)
                )
                needs_cover = (
                    not os.path.isfile(thumbpath) or
                    (is_overwrite_pdoc_thumbs and doctype == 'PDOC') or
                    (is_overwrite_amzn_thumbs and
                     doctype in ('EBOK', 'EBSP'))
                )
                if not needs_cover:
                    if is_verbose:
                        print('skipped (cover present or overwriting not '
                              'forced).')
                    continue

                if is_kfx:
                    image_data = kfx_metadata.get("cover_image_data")
                    if not image_data:
                        print('ERROR! No cover image found in "%s"' % fide)
                        continue
                if is_verbose:
                    print('PROCESSING COVER:', end=' ')
                try:
                    if is_kfx:
                        cover = process_image(image_data.decode('base64'),
                                              fix_thumb, doctype,
                                              is_verbose)
                    else:
                        cover = get_cover_image(section, mh, metadata,
                                                doctype, name, fide,
                                                is_verbose, fix_thumb)
                except IOError:
                    print('FAILED! Image format unrecognized...')
                    continue
                if cover:
                    cover.save(thumbpath)

        if lubimy_czytac and days:
            print("START of downloading real book page numbers...")
            get_real_pages(csv_pages, mark_real_pages)
            print("FINISH of downloading real book page numbers...")

        if not skip_apnx:
            print("START of generating book page numbers (APNX files)...")
            generate_apnx_files(docs, is_verbose, is_overwrite_apnx, days,
                                tempdir)
            print("FINISH of generating book page numbers (APNX files)...")

        if is_overwrite_pdoc_thumbs:
            for thumb_name in os.listdir(thumb_dir):
                if (thumb_name.startswith('thumbnail') and
                        thumb_name.endswith('.jpg') and
                        not thumb_name.endswith('portrait.jpg')):
                    fix_generated_thumbs(
                        os.path.join(thumb_dir, thumb_name),
                        is_verbose, fix_thumb)

        print("FINISH of extracting cover thumbnails...")
        # Nothing to copy back when no CSV was ever produced.
        if os.path.isfile(csv_pages):
            shutil.copy2(csv_pages, device_csv)
    finally:
        # Remove the temporary directory even when processing failed.
        clean_temp(tempdir)
    return 0