def init():
    """Initialize and check environment."""
    # Send a tiny detection request first (presumably so the Tika server is
    # up and responding), then report the version the server announces.
    click.echo("Inititalizing Tika ...")
    detector.from_buffer(io.BytesIO(b"Wakeup Tika"))
    response = requests.get(detector.ServerEndpoint + "/version")
    click.echo("Tika initialized: {}".format(response.text))

    # Install fpcalc on demand, then report the installed version.
    click.echo("Testing fpcalc ...")
    if not fpcalc.is_installed():
        fpcalc.install()
    version_info = fpcalc.get_version_info()
    click.echo("fpcalc installed: {}".format(version_info))
def batch(path, recursive, guess, debug):
    """Create ISCC Codes for all files in PATH.

    Example:

      $ iscc batch ~/Documents

    """
    # NOTE(review): the docstring above is kept short because it may be
    # rendered as CLI help text; the details live in these comments.
    #
    # Parameters:
    #   path      -- location handed to ``get_files`` to enumerate files.
    #   recursive -- forwarded to ``get_files(recursive=...)``.
    #   guess     -- forwarded to ``get_title(guess=...)``.
    #   debug     -- when truthy, log output is additionally sent to stdout.
    #
    # Returns a list with one dict per successfully processed file, with the
    # keys ``iscc``, ``norm_title``, ``tophash``, ``gmt`` and ``file_name``.
    # Files that cannot be processed are logged and skipped.
    if debug:
        log.add(sys.stdout)

    results = []
    for f in get_files(path, recursive=recursive):
        filesize = os.path.getsize(f)
        if not filesize:
            msg = "Cannot proccess empty file: {}".format(f)
            log.warning(msg)
            continue

        media_type = mime_clean(mime_guess(f))
        if media_type not in SUPPORTED_MIME_TYPES:
            fname = basename(f)
            msg = "Unsupported file {} with mime type: {},,,,".format(
                fname, media_type)
            log.warning(msg)
            continue

        if media_type == "application/x-mobipocket-ebook":
            try:
                tempdir, epub_filepath = mobi.extract(f)
                try:
                    tika_result = parser.from_file(epub_filepath)
                finally:
                    # Remove the extraction directory even when parsing
                    # fails, so a bad file does not leave temp data behind.
                    shutil.rmtree(tempdir)
            except Exception as e:
                # Fill the placeholder with the actual error; previously the
                # literal "%s" was logged and the exception was dropped.
                msg = "Error with mobi extraction %s" % e
                log.error(msg)
                continue
        else:
            tika_result = parser.from_file(f)

        title = get_title(tika_result, guess=guess, uri=f)
        mid, norm_title, _ = iscc.meta_id(title)

        gmt = mime_to_gmt(media_type, file_path=f)
        if gmt == GMT.IMAGE:
            try:
                cid = iscc.content_id_image(f)
            except Exception as e:
                msg = "Clould not proccess image: {} ({})".format(f, e)
                log.error(msg)
                continue
        elif gmt == GMT.TEXT:
            text = tika_result["content"]
            if not text:
                msg = "Could not extract text from {}".format(basename(f))
                log.warning(msg)
                continue
            cid = iscc.content_id_text(text)
        elif gmt == GMT.AUDIO:
            if not fpcalc.is_installed():
                fpcalc.install()
            features = audio_id.get_chroma_vector(f)
            cid = audio_id.content_id_audio(features)
        elif gmt == GMT.VIDEO:
            features = video_id.get_frame_vectors(abspath(f))
            cid = video_id.content_id_video(features)
        else:
            log.error("Could not generate ISCC")
            continue

        did = iscc.data_id(f)
        iid, tophash = iscc.instance_id(f)

        # Console output uses comma-separated components, while the returned
        # record uses the dash-separated code.
        iscc_code_cs = ",".join((mid, cid, did, iid))
        click.echo("{iscc_code},{tophash},{fname},{gmt},{title}".format(
            iscc_code=iscc_code_cs,
            tophash=tophash,
            fname=basename(f),
            gmt=gmt,
            title=norm_title,
        ))

        iscc_code = "-".join((mid, cid, did, iid))
        results.append(
            dict(
                iscc=iscc_code,
                norm_title=norm_title,
                tophash=tophash,
                gmt=gmt,
                file_name=basename(f),
            ))
    return results
def gen(file, guess, title, extra, verbose):
    """Generate ISCC Code for FILE."""
    # NOTE(review): the docstring above is kept short because it may be
    # rendered as CLI help text; the details live in these comments.
    #
    # Parameters:
    #   file    -- object exposing the path of the input file as ``file.name``.
    #   guess   -- forwarded to ``get_title(guess=...)`` when no title is given.
    #   title   -- explicit title; when falsy one is derived via ``get_title``.
    #   extra   -- extra metadata for ``iscc.meta_id``; falsy values become "".
    #   verbose -- when truthy, also print title, tophash, filepath and GMT.
    #
    # Returns a dict with the keys ``iscc``, ``norm_title``, ``tophash`` and
    # ``gmt``, or None when no code could be generated.
    # Raises click.BadParameter when the file is empty.
    filesize = os.path.getsize(file.name)
    if not filesize:
        raise click.BadParameter("Cannot proccess empty file: {}".format(
            file.name))

    media_type = mime_clean(mime_guess(file.name))
    if media_type not in SUPPORTED_MIME_TYPES:
        click.echo("Unsupported media type {}.".format(media_type))
        click.echo(
            "Please request support at https://github.com/iscc/iscc-cli/issues"
        )
        # Stop here instead of falling through and processing a file whose
        # type was just reported as unsupported.
        return

    if media_type == "application/x-mobipocket-ebook":
        tempdir, epub_filepath = mobi.extract(file.name)
        try:
            tika_result = parser.from_file(epub_filepath)
        finally:
            # Remove the extraction directory even when parsing fails.
            shutil.rmtree(tempdir)
    else:
        tika_result = parser.from_file(file.name)

    if not title:
        title = get_title(tika_result, guess=guess, uri=file.name)
    if not extra:
        extra = ""

    mid, norm_title, _ = iscc.meta_id(title, extra)

    gmt = mime_to_gmt(media_type, file_path=file.name)
    if gmt == GMT.IMAGE:
        cid = iscc.content_id_image(file.name)
    elif gmt == GMT.TEXT:
        text = tika_result["content"]
        if not text:
            click.echo("Could not extract text from {}".format(file.name))
            return
        cid = iscc.content_id_text(text)
    elif gmt == GMT.AUDIO:
        if not fpcalc.is_installed():
            fpcalc.install()
        features = audio_id.get_chroma_vector(file.name)
        cid = audio_id.content_id_audio(features)
    elif gmt == GMT.VIDEO:
        features = video_id.get_frame_vectors(abspath(file.name))
        cid = video_id.content_id_video(features)
    else:
        click.echo("Could not generate ISCC")
        return

    did = iscc.data_id(file.name)
    iid, tophash = iscc.instance_id(file.name)

    # The Meta-ID component is left out when no normalized title is available.
    if not norm_title:
        iscc_code = "-".join((cid, did, iid))
    else:
        iscc_code = "-".join((mid, cid, did, iid))

    click.echo("ISCC:{}".format(iscc_code))

    if verbose:
        if norm_title:
            click.echo("Norm Title: %s" % norm_title)
        click.echo("Tophash:    %s" % tophash)
        click.echo("Filepath:   %s" % file.name)
        click.echo("GMT:        %s" % gmt)

    return dict(iscc=iscc_code, norm_title=norm_title, tophash=tophash, gmt=gmt)
def web(url, guess, title, extra, verbose):
    """Generate ISCC Code from URL."""
    # NOTE(review): the docstring above is kept short because it may be
    # rendered as CLI help text; the details live in these comments.
    #
    # Parameters:
    #   url     -- address fetched with ``requests.get``.
    #   guess   -- forwarded to ``get_title(guess=...)`` when no title is given.
    #   title   -- explicit title; when falsy one is derived via ``get_title``.
    #   extra   -- extra metadata for ``iscc.meta_id``; falsy values become "".
    #   verbose -- when truthy, also print title, tophash, url and GMT.
    #
    # Returns a dict with the keys ``iscc``, ``norm_title``, ``tophash`` and
    # ``gmt``, or None when no code could be generated.
    # Raises click.BadArgumentUsage when the request itself fails.
    extra = extra or ""

    try:
        resp = requests.get(url, headers=HEADERS, stream=True)
    except Exception as e:
        raise click.BadArgumentUsage(e)

    # The whole response body is held in memory and rewound before every
    # consumer reads it.
    data = BytesIO(resp.content)
    media_type = clean_mime(detector.from_buffer(data))
    if media_type not in SUPPORTED_MIME_TYPES:
        click.echo("Unsupported media type {}".format(media_type))
        click.echo(
            "Please request support at https://github.com/iscc/iscc-cli/issues"
        )
        return

    if media_type == "application/x-mobipocket-ebook":
        data.seek(0)
        tempdir, filepath = mobi.extract(data)
        try:
            tika_result = parser.from_file(filepath)
        finally:
            # Remove the extraction directory even when parsing fails.
            shutil.rmtree(tempdir)
    else:
        data.seek(0)
        tika_result = parser.from_buffer(data)

    if not title:
        title = get_title(tika_result, guess=guess, uri=url)

    mid, norm_title, _ = iscc.meta_id(title, extra)

    gmt = mime_to_gmt(media_type)
    if gmt == GMT.IMAGE:
        data.seek(0)
        cid = iscc.content_id_image(data)
    elif gmt == GMT.TEXT:
        text = tika_result["content"]
        if not text:
            click.echo("Could not extract text")
            return
        cid = iscc.content_id_text(text)
    elif gmt == GMT.AUDIO:
        if not fpcalc.is_installed():
            fpcalc.install()
        data.seek(0)
        features = audio_id.get_chroma_vector(data)
        cid = audio_id.content_id_audio(features)
    elif gmt == GMT.VIDEO:
        local_path = download_file(url, sanitize=True)
        try:
            features = video_id.get_frame_vectors(local_path)
            cid = video_id.content_id_video(features)
        finally:
            # Delete the downloaded file even when frame extraction fails.
            os.remove(local_path)
    else:
        # Without this branch an unhandled media type reached the join below
        # with ``cid`` unbound and crashed with UnboundLocalError.
        click.echo("Could not generate ISCC")
        return

    data.seek(0)
    did = iscc.data_id(data)
    data.seek(0)
    iid, tophash = iscc.instance_id(data)

    # The Meta-ID component is left out when no normalized title is available.
    if not norm_title:
        iscc_code = "-".join((cid, did, iid))
    else:
        iscc_code = "-".join((mid, cid, did, iid))

    click.echo("ISCC:{}".format(iscc_code))

    if verbose:
        if norm_title:
            click.echo("Norm Title: %s" % norm_title)
        click.echo("Tophash:    %s" % tophash)
        click.echo("Filepath:   %s" % url)
        click.echo("GMT:        %s" % gmt)

    return dict(iscc=iscc_code, norm_title=norm_title, tophash=tophash, gmt=gmt)
def test_get_chroma_vector_file_path():
    """The chroma vector of the demo mp3, read by file path, matches the pinned values."""
    # Pinned reference vector for tests/audio/demo.mp3.
    expected = [
        684003877, 683946551, 1749295639, 2017796679, 2026256086, 2022066918,
        2022001639, 2021968035, 2038741139, 2059709571, 503750851, 369541315,
        320225426, 289292450, 830368930, 838789539, 1940835201, 1928186752,
        1651297920, 1651283600, 1650959072, 1655022116, 1722069540, 1726259749,
        1713694254, 1847914286, 1847912494, 1780832302, -362410962, -352973810,
        1809196111, 1770397775, 1753686797, 683942429, 943989277, 943989255,
        944121430, 952503910, 948374246, 948717799, 1485621411, 462203011,
        508470403, 370053251, 303988867, 322879651, 322892963, 862907811,
        1928256417, 1928317841, 1651297152, 1647091344, 1650827936, 1659216416,
        1722069540, 1726263844, 1717887533, 1713696302, 1847912494, 1847883822,
        -366540754, -345633778, -336184242, 1771447375, 1753620815, 1757684255,
        675553815, 943989255, 944120390, 952508006, 948308582, 948718050,
        411879650, 428648578, 516861059, 370057347, 303988865, 306086033,
        306086049, 841919649, 846133665, 1919929264, 1647168400, 1647101584,
        1650827936, 1659216484, 1671733796, 1738838588, 1717887517, 1713696302,
        1847913774, 1847879726, 1780960302, -362410978, -336196594, 1775641678,
        1770397775, 1753555743, 683942429, 943989271, 944185926, 2026255094,
        2022051494, 2021919654,
    ]

    # The extraction depends on fpcalc, so install it on demand.
    if not fpcalc.is_installed():
        fpcalc.install()

    vector = audio_id.get_chroma_vector("tests/audio/demo.mp3")
    assert isinstance(vector, list)
    assert vector == expected
def test_install():
    """``fpcalc.install`` returns an existing path and fpcalc is then reported as installed."""
    installed_path = fpcalc.install()
    path_exists = os.path.exists(installed_path)
    assert path_exists
    assert fpcalc.is_installed()
def test_is_installed():
    """``fpcalc.is_installed`` answers with a plain boolean."""
    status = fpcalc.is_installed()
    assert isinstance(status, bool)