def dump(path, strip, meta, content):
    """Dump Tika extraction results for PATH (file or url path)."""
    # NOTE: the docstring above is kept verbatim; if this is a click command
    # it doubles as the --help text. Parameter notes therefore live here:
    #   path    -- file or url path; its media type is guessed with
    #              mime_guess() and normalised with mime_clean().
    #   strip   -- when truthy, extracted content is cut to this many chars.
    #   meta    -- when truthy, only the "metadata" entry is printed.
    #   content -- when truthy, only the "content" entry is printed.
    # Raises UsageError when both meta and content are requested.

    # Reject contradictory flags before paying for any extraction work.
    if meta and content:
        raise UsageError("Use either --meta or --content for selective output.")

    media_type = mime_clean(mime_guess(path))
    if media_type not in SUPPORTED_MIME_TYPES:
        # Only a notice: extraction is still attempted below.
        # NOTE(review): confirm that continuing here is intentional.
        click.echo("Unsupported media type {}.".format(media_type))
        click.echo("Please request support at https://github.com/iscc/iscc-cli/issues")

    if media_type == "application/x-mobipocket-ebook":
        # MOBI files are unpacked first; Tika then parses the extracted file.
        tempdir, epub_filepath = mobi.extract(path)
        try:
            tika_result = parser.from_file(epub_filepath)
        finally:
            # Remove the unpack directory even when parsing fails.
            shutil.rmtree(tempdir)
    else:
        tika_result = parser.from_file(path)

    if strip:
        # "content" may be present but None; `or ""` keeps slicing safe.
        tika_result["content"] = (tika_result.get("content") or "")[:strip]

    if meta:
        click.echo(json.dumps(tika_result.get("metadata", ""), indent=2))
    elif content:
        click.echo(json.dumps(tika_result.get("content", ""), indent=2))
    else:
        click.echo(json.dumps(tika_result, indent=2))
def extract_mobi_folder(bookdir, force=False):
    """Unpack the ``.mobi`` files found directly inside ``bookdir``.

    Each book is extracted with ``mobi.extract`` and the ``mobi7``
    sub-folder of the extraction result is handed to ``copy_delcopy``
    together with a target folder named after the book (the file path
    minus its extension).

    Args:
        bookdir: Directory that is scanned (non-recursively) for books.
        force: When false, books that already have a folder of the same
            name next to them are skipped. When true, every ``*.mobi``
            match in ``bookdir`` is processed.

    Returns:
        List of target folder paths, one per processed book.
    """
    # Books that have not been unpacked yet (no sibling folder of that name).
    with os.scandir(bookdir) as entries:
        mobilist = [
            entry.path
            for entry in entries
            if entry.is_file()
            and os.path.splitext(entry.path)[1] == ".mobi"
            and not os.path.isdir(os.path.splitext(entry.path)[0])
        ]
    if force:
        # Re-extract everything, including books that already have a folder.
        mobilist = glob(os.path.join(bookdir, "*.mobi"))

    convlist = []
    for mobi_path in mobilist:
        tempdir, _ = mobi.extract(mobi_path)
        # Target folder name is just the file path minus the extension.
        target = os.path.splitext(mobi_path)[0]
        try:
            # Build the source path portably. A hard-coded "\\mobi7" suffix
            # is not a path separator on POSIX, so taking dirname() of it
            # for cleanup would point at the *parent* of tempdir.
            copy_delcopy(os.path.join(tempdir, "mobi7"), target)
        finally:
            # Always remove exactly the directory mobi.extract created.
            shutil.rmtree(tempdir)
        convlist.append(target)
    return convlist
def test_extract():
    """Extract every MOBI-family sample in TEST_DIR and check the results exist."""
    mobi_suffixes = (".MOBI", ".PRC", ".AZW", ".AZW3", ".AZW4")
    for sample in os.listdir(TEST_DIR):
        suffix = splitext(sample)[-1].upper()
        if suffix not in mobi_suffixes:
            continue
        workdir, extracted = mobi.extract(join(TEST_DIR, sample))
        assert exists(workdir)
        assert exists(extracted)
        # Remove the extraction directory once both paths were verified.
        shutil.rmtree(workdir)
def main(argv):
    """Unpack the MOBI file named on the command line into a directory.

    The arguments are read from ``sys.argv`` (the ``argv`` parameter is not
    consulted): the first is the MOBI file, the optional second one is the
    output location. Without it, the output name is the input name with
    '.mobi' replaced by '_unpacked_mobi'.
    """
    source = sys.argv[1]
    if len(sys.argv) > 2:
        output = sys.argv[2]
    else:
        output = source.replace('.mobi', '_unpacked_mobi')
    print('Unpacking MOBI to {}'.format(output))
    # Extract into a temporary directory, then move it to its destination.
    extracted_dir, _ = mobi.extract(source)
    shutil.move(extracted_dir, output)
def unpack_mobi(self, filename):
    """Unpack a MOBI file into the storage area, reusing earlier results.

    The file's MD5 (from ``generate_file_md5``) is used as the stem of the
    stored file, so a previously unpacked copy is found by name.

    Args:
        filename: Path of the MOBI file to unpack.

    Returns:
        ``str`` path of an already stored file when one with a matching
        stem exists; otherwise the result of
        ``create_valid_epub_from_epub_like_structure`` for HTML extractions,
        or the destination path object inside the storage area for any
        other extracted file type.
    """
    storage_area = self.get_mobi_storage_area()
    filemd5 = generate_file_md5(filename)

    # Cache hit: a stored file whose stem equals the checksum.
    for fname in storage_area.iterdir():
        if fname.is_file() and (fname.stem == filemd5):
            return str(fname)

    with mute_stdout():
        tempdir, extracted_file = mobi.extract(str(filename))

    filetype = Path(extracted_file).suffix.strip(".")
    if filetype == "html":
        # NOTE(review): tempdir is handed to the helper and not removed
        # here; confirm the helper takes care of cleaning it up.
        return self.create_valid_epub_from_epub_like_structure(
            tempdir, storage_area.joinpath(f"{filemd5}.epub"))

    dst_filename = storage_area.joinpath(f"{filemd5}.{filetype}")
    try:
        shutil.copy(extracted_file, dst_filename)
    finally:
        # Remove the extraction directory even when the copy fails.
        TemporaryDirectory._rmtree(tempdir)
    return dst_filename
def parse(dir, outPath):
    """Convert every ``.mobi`` file below ``dir`` into a text file.

    Each book is extracted with ``mobi.extract``; the extracted file is
    read as UTF-8 HTML, reduced to its text with BeautifulSoup and written
    to ``<outPath>/<file name>.txt``. Errors for a single file are printed
    (message plus traceback) and the walk continues with the next file.

    Args:
        dir: Root directory that is walked recursively.
        outPath: Directory that receives the ``.txt`` files; it is created
            when the first book is written.
    """
    count = 0
    for root, dirs, files in os.walk(dir):
        for file in files:
            try:
                # Full path of the current file.
                path = os.path.join(root, file)
                if not path.endswith(".mobi"):
                    continue

                # filepath points at the extracted HTML file.
                tempdir, filepath = mobi.extract(path)
                try:
                    with open(filepath, 'r', encoding='utf-8') as book:
                        soup = BeautifulSoup(book.read(), "html.parser")

                    # Create the directory that is actually written to.
                    os.makedirs(outPath, exist_ok=True)
                    # `file` is already the bare name, so no separator-
                    # specific splitting of `path` is needed.
                    savePath = os.path.join(outPath, file + ".txt")
                    with open(savePath, 'w', encoding='utf-8') as out:
                        out.write(soup.text)
                finally:
                    # Delete the temporary extraction directory, also on error.
                    shutil.rmtree(tempdir)

                count = count + 1
                if count % 10 == 0:
                    # Progress message ("processed <count>") every ten books.
                    print("处理了" + str(count))
            except Exception as exc:
                print(exc)
                traceback.print_exc()
                continue
def import_mobi_file(self, text_file_path):
    """Import the text of a MOBI file and refresh the text preview.

    The file is extracted with ``mobi.extract``. When the extracted file
    is TXT or HTML, its content is converted with ``html2text`` and stored
    through ``save_temp_data``. Otherwise the loading dialog is dismissed
    and an instruction message is shown.

    Args:
        text_file_path: Path of the MOBI file to import.
    """
    import os  # function-scope import: only needed for the extension check

    tempdir, filepath = mobi.extract(text_file_path)
    try:
        # If the extracted MOBI file has extension TXT or HTML, everything
        # worked properly. Check the real extension rather than searching
        # the whole path, so directory names cannot match by accident.
        extension = os.path.splitext(filepath)[1].lower()
        if extension in ('.txt', '.html'):
            with open(filepath, 'r', errors='ignore') as file:
                content = file.read()
            new_text = html2text.html2text(content.replace('\\n', ''))
            self.save_temp_data(new_text)
        # In any other case (for example the extracted file has EPUB
        # format) the MOBI file was encrypted and content will be corrupted.
        else:
            self.text_loading_dialog.dismiss()
            self.show_instructions(
                'Something went wrong :( The file provided cannot be processed. Please try another one.'
            )
    finally:
        # Clean up the extraction directory on every path.
        shutil.rmtree(tempdir, ignore_errors=True)

    self.update_text_preview()
    self.text_loading_dialog.dismiss()
def batch(path, recursive, guess, debug):
    """Create ISCC Codes for all files in PATH.

    Example:

      $ iscc batch ~/Documents

    """
    # NOTE: the docstring above is kept as is; if this is a click command it
    # doubles as the --help text. Parameter notes therefore live here:
    #   path      -- location whose files are listed by get_files().
    #   recursive -- forwarded to get_files().
    #   guess     -- forwarded to get_title().
    #   debug     -- when truthy, log output is also sent to stdout.
    # Returns a list with one dict per processed file (keys: iscc,
    # norm_title, tophash, gmt, file_name). Files that cannot be processed
    # are logged and skipped.
    if debug:
        log.add(sys.stdout)

    results = []
    for f in get_files(path, recursive=recursive):
        filesize = os.path.getsize(f)
        if not filesize:
            msg = "Cannot proccess empty file: {}".format(f)
            log.warning(msg)
            continue

        media_type = mime_clean(mime_guess(f))
        if media_type not in SUPPORTED_MIME_TYPES:
            fname = basename(f)
            msg = "Unsupported file {} with mime type: {},,,,".format(
                fname, media_type)
            log.warning(msg)
            continue

        if media_type == "application/x-mobipocket-ebook":
            try:
                tempdir, epub_filepath = mobi.extract(f)
                try:
                    tika_result = parser.from_file(epub_filepath)
                finally:
                    # Remove the unpack directory even when parsing fails.
                    shutil.rmtree(tempdir)
            except Exception as e:
                # Report which file failed and why (the placeholder used to
                # be logged unformatted).
                msg = "Error with mobi extraction {} ({})".format(
                    basename(f), e)
                log.error(msg)
                continue
        else:
            tika_result = parser.from_file(f)

        title = get_title(tika_result, guess=guess, uri=f)
        mid, norm_title, _ = iscc.meta_id(title)
        gmt = mime_to_gmt(media_type, file_path=f)

        # The Content-ID algorithm depends on the generic media type.
        if gmt == GMT.IMAGE:
            try:
                cid = iscc.content_id_image(f)
            except Exception as e:
                msg = "Clould not proccess image: {} ({})".format(f, e)
                log.error(msg)
                continue
        elif gmt == GMT.TEXT:
            text = tika_result["content"]
            if not text:
                msg = "Could not extract text from {}".format(basename(f))
                log.warning(msg)
                continue
            cid = iscc.content_id_text(text)
        elif gmt == GMT.AUDIO:
            if not fpcalc.is_installed():
                fpcalc.install()
            features = audio_id.get_chroma_vector(f)
            cid = audio_id.content_id_audio(features)
        elif gmt == GMT.VIDEO:
            features = video_id.get_frame_vectors(abspath(f))
            cid = video_id.content_id_video(features)
        else:
            log.error("Could not generate ISCC")
            continue

        did = iscc.data_id(f)
        iid, tophash = iscc.instance_id(f)

        # One comma-separated line per file on stdout ...
        iscc_code_cs = ",".join((mid, cid, did, iid))
        click.echo("{iscc_code},{tophash},{fname},{gmt},{title}".format(
            iscc_code=iscc_code_cs,
            tophash=tophash,
            fname=basename(f),
            gmt=gmt,
            title=norm_title,
        ))

        # ... and the dash-joined code in the returned records.
        iscc_code = "-".join((mid, cid, did, iid))
        results.append(
            dict(
                iscc=iscc_code,
                norm_title=norm_title,
                tophash=tophash,
                gmt=gmt,
                file_name=basename(f),
            ))
    return results
def gen(file, guess, title, extra, verbose):
    """Generate ISCC Code for FILE."""
    # NOTE: the docstring above is kept verbatim; if this is a click command
    # it doubles as the --help text. Parameter notes therefore live here:
    #   file    -- object whose .name attribute is the path to process.
    #   guess   -- forwarded to get_title() when no title is given.
    #   title   -- explicit title; derived via get_title() when falsy.
    #   extra   -- extra metadata for iscc.meta_id(); "" when falsy.
    #   verbose -- when truthy, title/tophash/path/GMT are echoed as well.
    # Returns a dict (iscc, norm_title, tophash, gmt), or None when no
    # Content-ID could be produced.
    # Raises click.BadParameter for an empty file.
    filesize = os.path.getsize(file.name)
    if not filesize:
        raise click.BadParameter("Cannot proccess empty file: {}".format(
            file.name))

    media_type = mime_clean(mime_guess(file.name))
    if media_type not in SUPPORTED_MIME_TYPES:
        # Only a notice: processing still continues below.
        # NOTE(review): confirm that continuing here is intentional.
        click.echo("Unsupported media type {}.".format(media_type))
        click.echo(
            "Please request support at https://github.com/iscc/iscc-cli/issues"
        )

    if media_type == "application/x-mobipocket-ebook":
        tempdir, epub_filepath = mobi.extract(file.name)
        try:
            tika_result = parser.from_file(epub_filepath)
        finally:
            # Remove the unpack directory even when parsing fails.
            shutil.rmtree(tempdir)
    else:
        tika_result = parser.from_file(file.name)

    if not title:
        title = get_title(tika_result, guess=guess, uri=file.name)
    if not extra:
        extra = ""

    mid, norm_title, _ = iscc.meta_id(title, extra)
    gmt = mime_to_gmt(media_type, file_path=file.name)

    # The Content-ID algorithm depends on the generic media type.
    if gmt == GMT.IMAGE:
        cid = iscc.content_id_image(file.name)
    elif gmt == GMT.TEXT:
        text = tika_result["content"]
        if not text:
            click.echo("Could not extract text from {}".format(file.name))
            return
        cid = iscc.content_id_text(text)
    elif gmt == GMT.AUDIO:
        if not fpcalc.is_installed():
            fpcalc.install()
        features = audio_id.get_chroma_vector(file.name)
        cid = audio_id.content_id_audio(features)
    elif gmt == GMT.VIDEO:
        features = video_id.get_frame_vectors(abspath(file.name))
        cid = video_id.content_id_video(features)
    else:
        click.echo("Could not generate ISCC")
        return

    did = iscc.data_id(file.name)
    iid, tophash = iscc.instance_id(file.name)

    # The Meta-ID is left out of the code when no normalised title exists.
    if not norm_title:
        iscc_code = "-".join((cid, did, iid))
    else:
        iscc_code = "-".join((mid, cid, did, iid))

    click.echo("ISCC:{}".format(iscc_code))
    if verbose:
        if norm_title:
            click.echo("Norm Title: %s" % norm_title)
        click.echo("Tophash: %s" % tophash)
        click.echo("Filepath: %s" % file.name)
        click.echo("GMT: %s" % gmt)

    return dict(iscc=iscc_code, norm_title=norm_title, tophash=tophash, gmt=gmt)
def web(url, guess, title, extra, verbose):
    """Generate ISCC Code from URL."""
    # NOTE: the docstring above is kept verbatim; if this is a click command
    # it doubles as the --help text. Parameter notes therefore live here:
    #   url     -- address that is fetched with requests.get().
    #   guess   -- forwarded to get_title() when no title is given.
    #   title   -- explicit title; derived via get_title() when falsy.
    #   extra   -- extra metadata for iscc.meta_id(); "" when falsy.
    #   verbose -- when truthy, title/tophash/url/GMT are echoed as well.
    # Returns a dict (iscc, norm_title, tophash, gmt), or None when the
    # media type is unsupported or no Content-ID could be produced.
    # Raises click.BadArgumentUsage when the request itself fails.
    extra = extra or ""

    try:
        resp = requests.get(url, headers=HEADERS, stream=True)
    except Exception as e:
        # Pass the message as text and keep the original error as the cause.
        raise click.BadArgumentUsage(str(e)) from e

    # The whole response body is held in memory and re-read via seek(0).
    data = BytesIO(resp.content)
    media_type = clean_mime(detector.from_buffer(data))
    if media_type not in SUPPORTED_MIME_TYPES:
        click.echo("Unsupported media type {}".format(media_type))
        click.echo(
            "Please request support at https://github.com/iscc/iscc-cli/issues"
        )
        return

    if media_type == "application/x-mobipocket-ebook":
        data.seek(0)
        tempdir, filepath = mobi.extract(data)
        try:
            tika_result = parser.from_file(filepath)
        finally:
            # Remove the unpack directory even when parsing fails.
            shutil.rmtree(tempdir)
    else:
        data.seek(0)
        tika_result = parser.from_buffer(data)

    if not title:
        title = get_title(tika_result, guess=guess, uri=url)

    mid, norm_title, _ = iscc.meta_id(title, extra)
    gmt = mime_to_gmt(media_type)

    # The Content-ID algorithm depends on the generic media type.
    if gmt == GMT.IMAGE:
        data.seek(0)
        cid = iscc.content_id_image(data)
    elif gmt == GMT.TEXT:
        text = tika_result["content"]
        if not text:
            click.echo("Could not extract text")
            return
        cid = iscc.content_id_text(text)
    elif gmt == GMT.AUDIO:
        if not fpcalc.is_installed():
            fpcalc.install()
        data.seek(0)
        features = audio_id.get_chroma_vector(data)
        cid = audio_id.content_id_audio(features)
    elif gmt == GMT.VIDEO:
        # Frame extraction works on a local file, so download a copy first.
        local_path = download_file(url, sanitize=True)
        try:
            features = video_id.get_frame_vectors(local_path)
            cid = video_id.content_id_video(features)
        finally:
            # Remove the downloaded copy even when processing fails.
            os.remove(local_path)
    else:
        # Without this branch `cid` would be unbound further down.
        click.echo("Could not generate ISCC")
        return

    data.seek(0)
    did = iscc.data_id(data)
    data.seek(0)
    iid, tophash = iscc.instance_id(data)

    # The Meta-ID is left out of the code when no normalised title exists.
    if not norm_title:
        iscc_code = "-".join((cid, did, iid))
    else:
        iscc_code = "-".join((mid, cid, did, iid))

    click.echo("ISCC:{}".format(iscc_code))
    if verbose:
        if norm_title:
            click.echo("Norm Title: %s" % norm_title)
        click.echo("Tophash: %s" % tophash)
        click.echo("Filepath: %s" % url)
        click.echo("GMT: %s" % gmt)

    return dict(iscc=iscc_code, norm_title=norm_title, tophash=tophash, gmt=gmt)
def test_extract_file_like():
    """Extraction must also work when given an open binary file object."""
    sample_path = join(TEST_DIR, "demo.mobi")
    with open(sample_path, "rb") as stream:
        workdir, extracted = mobi.extract(stream)
        assert exists(workdir)
        assert exists(extracted)
        # Remove the extraction directory once both paths were verified.
        shutil.rmtree(workdir)
from bs4 import BeautifulSoup
from weasyprint import HTML, CSS, default_url_fetcher
import mobi
import os
import shutil
import sys

if __name__ == "__main__":
    # Convert the MOBI file given as first command-line argument to a PDF
    # named "<input file name>.pdf" in the current working directory.
    unzip_file_path = sys.argv[1]
    tempdir, filepath = mobi.extract(unzip_file_path)
    try:
        # Relative resources (images) are resolved against the directory of
        # the extracted file. Derive it from the path instead of cutting a
        # fixed nine characters, and keep the trailing separator that the
        # slice used to leave in place.
        image_base = os.path.join(os.path.dirname(filepath), "")
        html = HTML(filename=filepath, base_url=image_base, encoding="utf8")
        filename = os.path.basename(unzip_file_path)
        html.write_pdf(filename + '.pdf')
    finally:
        # Remove the extraction directory even when rendering fails.
        shutil.rmtree(tempdir)
def from_file(file: UploadFile = File(...), title: str = Form(""), extra: str = Form("")):
    """Generate Full ISCC Code from Media File with optional explicit metadata."""
    # NOTE: the docstring above is kept verbatim; web frameworks commonly
    # publish endpoint docstrings as the API description.
    #   file  -- uploaded media; file.file is re-read via seek(0) by every
    #            processing step.
    #   title -- explicit title; derived via get_title() when empty.
    #   extra -- extra metadata passed to iscc.meta_id().
    # Returns a dict with iscc, tophash, gmt and bits, plus title/extra
    # entries when the normalised values are non-empty.
    # Raises HTTPException 415 for unsupported media types and 422 when no
    # Content-ID can be produced.
    try:
        media_type = detector.from_buffer(file.file)
        if media_type not in SUPPORTED_MIME_TYPES:
            raise HTTPException(
                HTTP_415_UNSUPPORTED_MEDIA_TYPE,
                "Unsupported media type '{}'. Please request support at "
                "https://github.com/iscc/iscc-service/issues.".format(media_type),
            )

        if media_type == "application/x-mobipocket-ebook":
            file.file.seek(0)
            tempdir, filepath = mobi.extract(file.file)
            try:
                tika_result = parser.from_file(filepath)
            finally:
                # Remove the unpack directory even when parsing fails.
                shutil.rmtree(tempdir)
        else:
            file.file.seek(0)
            tika_result = parser.from_buffer(file.file)

        if not title:
            title = get_title(tika_result, guess=True)

        mid, norm_title, norm_extra = iscc.meta_id(title, extra)
        gmt = mime_to_gmt(media_type)

        # The Content-ID algorithm depends on the generic media type.
        if gmt == GMT.IMAGE:
            file.file.seek(0)
            cid = iscc.content_id_image(file.file)
        elif gmt == GMT.TEXT:
            text = tika_result["content"]
            if not text:
                raise HTTPException(HTTP_422_UNPROCESSABLE_ENTITY,
                                    "Could not extract text")
            cid = iscc.content_id_text(text)
        elif gmt == GMT.AUDIO:
            file.file.seek(0)
            features = audio_id.get_chroma_vector(file.file)
            cid = audio_id.content_id_audio(features)
        elif gmt == GMT.VIDEO:
            # Frame extraction works on a real file: spool the upload to a
            # uniquely named file in APP_DIR first.
            file.file.seek(0)
            _, ext = splitext(file.filename)
            fn = "{}{}".format(uuid.uuid4(), ext)
            tmp_path = join(APP_DIR, fn)
            try:
                with open(tmp_path, "wb") as buffer:
                    shutil.copyfileobj(file.file, buffer)
                features = video_id.get_frame_vectors(tmp_path)
                cid = video_id.content_id_video(features)
            finally:
                # Remove the spooled copy even when processing fails.
                if exists(tmp_path):
                    os.remove(tmp_path)
        else:
            # Without this branch `cid` would be unbound further down and
            # the request would fail with an unhandled error.
            raise HTTPException(HTTP_422_UNPROCESSABLE_ENTITY,
                                "Could not generate ISCC")

        file.file.seek(0)
        did = iscc.data_id(file.file)
        file.file.seek(0)
        iid, tophash = iscc.instance_id(file.file)

        # The Meta-ID is left out of the code when no normalised title exists.
        if not norm_title:
            iscc_code = "-".join((cid, did, iid))
        else:
            iscc_code = "-".join((mid, cid, did, iid))

        components = iscc_split(iscc_code)
        result = dict(
            iscc=iscc_code,
            tophash=tophash,
            gmt=gmt,
            bits=[code_to_bits(c) for c in components],
        )
        if norm_title:
            result["title"] = title
            result["title_trimmed"] = norm_title
        if norm_extra:
            result["extra"] = extra
            result["extra_trimmed"] = norm_extra
        return result
    finally:
        # Release the upload on success and on every error path.
        file.file.close()