def test_iscc_from_url_no_meta():
    """Check the ISCC generated from a URL when no metadata is passed.

    The returned dict must carry the known content/data/instance components
    for the demo image, and the MetaID computed from the string "demo".
    """
    demo_url = "https://github.com/iscc/iscc-cli/raw/master/tests/image/demo.png"
    result = lib.iscc_from_url(demo_url)
    assert isinstance(result, dict)

    # Content-, Data- and Instance-ID of the demo image.
    expected_tail = "CYDfTq7Qc7Fre-CDij3vGU1BkCZ-CRNssh4Qc1x5B"
    assert expected_tail in result["iscc"]

    # The MetaID part must match the one generated for the title "demo".
    expected_mid, _, _ = iscc.meta_id("demo")
    assert expected_mid in result["iscc"]
def save(self, *args, **kwargs):
    """Refresh the ISCC identifier of this object, then save it.

    The MetaID is always recomputed from ``self.title`` and ``self.extra``.
    If an identifier already exists, only its first component is replaced.
    If a freshly uploaded file is attached, the complete code is regenerated
    from the file data and ``self.tophash`` is updated.

    Raises:
        ValueError: if a new upload has an extension that is in neither
            ``TEXT_EXTENSIONS`` nor ``IMAGE_EXTENSIONS`` (previously this
            surfaced as an UnboundLocalError on ``cid``).
    """
    mid, _title, _extra = iscc.meta_id(self.title, self.extra)

    if self.ident:
        # Swap in the fresh MetaID, keep the remaining components untouched.
        self.ident = '-'.join([mid] + self.ident.split('-')[1:])

    if self.file and isinstance(self.file.file, UploadedFile):
        # New upload: generate the full ISCC from the file contents.
        _, file_extension = splitext(self.file.name)
        ext = file_extension.lower().lstrip('.')
        data = self.file.open('rb').read()

        if ext in self.TEXT_EXTENSIONS:
            if ext == 'docx':
                text = docx2txt.process(BytesIO(data))
            else:
                text = self.file.open().read()
            cid = iscc.content_id_text(text)
        elif ext in self.IMAGE_EXTENSIONS:
            cid = iscc.content_id_image(BytesIO(data))
        else:
            # Without a Content-ID no valid code can be assembled.
            raise ValueError(
                "Cannot generate ISCC for unsupported file extension "
                "'{}'".format(ext)
            )

        did = iscc.data_id(data)
        iid, self.tophash = iscc.instance_id(data)
        self.ident = '-'.join((mid, cid, did, iid))

    super(MediaContent, self).save(*args, **kwargs)
def test_hamming_distance():
    """Check distance bounds between MetaIDs of slightly altered titles."""
    # Two integers that differ in exactly two bit positions.
    assert iscc.distance(0b0001111, 0b1000111) == 2

    extra = "von Michael Ende"
    reference = iscc.meta_id("Die Unendliche Geschichte", extra)[0]

    # (altered title, maximum allowed distance to the reference MetaID)
    near_variants = (
        ("Die UnXndliche Geschichte", 10),  # change one character
        ("Die nendliche Geschichte", 14),  # delete one character
        ("Die UnendlicheX Geschichte", 13),  # add one character
        ("Diex Unandlische Geschiche", 22),  # add, change, delete
        ("Unendliche Geschichte, Die", 13),  # change word order
    )
    for variant_title, max_distance in near_variants:
        variant = iscc.meta_id(variant_title, extra)[0]
        assert iscc.distance(reference, variant) <= max_distance

    # Totally different title (and no extra metadata).
    unrelated = iscc.meta_id("Now for something different")[0]
    assert iscc.distance(reference, unrelated) >= 24
def meta_changed(self):
    """Recompute the MetaID from the title/extra edit fields.

    Stores the MetaID and both formatted strings returned by
    ``iscc.meta_id`` on the instance, then calls ``show_conflicts`` when a
    content id is already set.
    """
    code, formatted_title, formatted_extra = iscc.meta_id(
        title=self.edit_title.text(),
        extra=self.edit_extra.text(),
    )
    self.meta_id = code
    self.title_formatted = formatted_title
    self.extra_formatted = formatted_extra

    if not self.content_id:
        return
    self.show_conflicts()
def site_iscc():
    """Print the ISCC code for the content returned by ``get_content``.

    Emits the trimmed title/extra, the full four-component code and the
    second value returned by ``iscc.instance_id``.
    """
    body_text = get_content('text')
    raw_bytes = get_content('data')

    meta_code, norm_title, norm_extra = iscc.meta_id("ISCC - Content Identifiers")
    content_code = iscc.content_id_text(body_text)
    data_code = iscc.data_id(raw_bytes)
    instance_code, tophash = iscc.instance_id(raw_bytes)

    full_code = '-'.join([meta_code, content_code, data_code, instance_code])

    report = (
        ('SITE:',),
        ('TITLE:', norm_title, norm_extra),
        ('ISCC:', full_code),
        ('IIDF:', tophash),
    )
    for row in report:
        print(*row)
def spec_iscc():
    """Print the ISCC code of ``docs/specification.md``.

    The file is read once as UTF-8 text (for the Content-ID) and once as raw
    bytes (for the Data-ID and Instance-ID). Both handles are closed
    deterministically via ``with``.
    """
    spec_path = 'docs/specification.md'
    title = "ISCC - Specification"

    with open(spec_path, encoding='utf-8') as text_file:
        text = text_file.read()
    with open(spec_path, 'rb') as data_file:
        data = data_file.read()

    # meta_id returns the trimmed title/extra; these are what gets printed.
    mid, title, extra = iscc.meta_id(title)
    cidt = iscc.content_id_text(text)
    did = iscc.data_id(data)
    iid, hash_ = iscc.instance_id(data)

    code = '-'.join((mid, cidt, did, iid))
    print('SPEC:')
    print('TITLE:', title, extra)
    print('ISCC:', code)
    print('IIDF:', hash_)
def site_iscc():
    """Print the ISCC code for the content returned by ``get_content``.

    Output consists of a header line, the trimmed title/extra, the joined
    four-component code and the second value from ``iscc.instance_id``.
    """
    page_text = get_content("text")
    page_data = get_content("data")

    parts = []
    meta_code, trimmed_title, trimmed_extra = iscc.meta_id(
        "ISCC - Content Identifiers"
    )
    parts.append(meta_code)
    parts.append(iscc.content_id_text(page_text))
    parts.append(iscc.data_id(page_data))
    instance_code, tophash = iscc.instance_id(page_data)
    parts.append(instance_code)

    print("SITE:")
    print("TITLE:", trimmed_title, trimmed_extra)
    print("ISCC:", "-".join(parts))
    print("IIDF:", tophash)
def spec_iscc():
    """Print the ISCC code of the specification document.

    ``docs/specification.md`` under ``PROJECT_DIR`` is read once as UTF-8
    text (for the Content-ID) and once as raw bytes (for the Data-ID and
    Instance-ID). Both handles are closed deterministically via ``with``.
    """
    spec_path = join(PROJECT_DIR, "docs/specification.md")
    title = "ISCC - Specification"

    with open(spec_path, encoding="utf-8") as text_file:
        text = text_file.read()
    with open(spec_path, "rb") as data_file:
        data = data_file.read()

    # meta_id returns the trimmed title/extra; these are what gets printed.
    mid, title, extra = iscc.meta_id(title)
    cidt = iscc.content_id_text(text)
    did = iscc.data_id(data)
    iid, hash_ = iscc.instance_id(data)

    code = "-".join((mid, cidt, did, iid))
    print("SPEC:")
    print("TITLE:", title, extra)
    print("ISCC:", code)
    print("IIDF:", hash_)
def meta_id(meta: Metadata):
    """Generate MetaID from 'title' and optional 'extra' metadata"""
    # A missing/empty extra is passed to the generator as an empty string.
    extra = meta.extra or ""
    code, trimmed_title, trimmed_extra = iscc.meta_id(meta.title, extra)

    response = dict(
        code=code,
        bits=code_to_bits(code),
        ident=code_to_int(code),
        title=meta.title,
        title_trimmed=trimmed_title,
    )
    # Extra fields are only reported when extra metadata was supplied.
    if extra:
        response.update(extra=extra, extra_trimmed=trimmed_extra)
    return response
def test_meta_id():
    """Check MetaID generation against fixed codes and distances."""
    # str and bytes titles must yield the same code.
    for raw_title in ("ISCC Content Identifiers", b"ISCC Content Identifiers"):
        code, _, _ = iscc.meta_id(raw_title)
        assert code == "CCDFPFc87MhdT"

    reference, title, extra = iscc.meta_id("Die Unendliche Geschichte")
    assert reference == "CCAKevDpE1eEL"
    assert title == "die unendliche geschichte"
    assert extra == ""

    # Accents, punctuation and surrounding whitespace must not change the code.
    # NOTE(review): this literal looks mis-encoded - confirm against upstream.
    noisy = iscc.meta_id(" Die unéndlÃ­che, Geschichte ")[0]
    assert reference == noisy

    # One substituted character.
    typo = iscc.meta_id("Die Unentliche Geschichte")[0]
    assert iscc.distance(reference, typo) == 8

    # Reordered words.
    reordered = iscc.meta_id("Geschichte, Die Unendliche")[0]
    assert iscc.distance(reference, reordered) == 9

    # Bytes that are not valid UTF-8 are rejected.
    with pytest.raises(UnicodeDecodeError):
        iscc.meta_id(b"\xc3\x28")
def test_meta_id():
    """Check MetaID generation against fixed codes and distances."""
    # str and bytes titles must yield the same code.
    for raw_title in ('ISCC Content Identifiers', b'ISCC Content Identifiers'):
        code, _, _ = iscc.meta_id(raw_title)
        assert code == 'CCDGhLx6tREif'

    reference, title, extra = iscc.meta_id('Die Unendliche Geschichte')
    assert reference == "CCAZF4K1bBv8i"
    assert title == 'die unendliche geschichte'
    assert extra == ''

    # Accents, punctuation and surrounding whitespace must not change the code.
    # NOTE(review): this literal looks mis-encoded - confirm against upstream.
    noisy = iscc.meta_id(' Die unéndlÃ­che, Geschichte ')[0]
    assert reference == noisy

    # One substituted character.
    typo = iscc.meta_id('Die Unentliche Geschichte')[0]
    assert iscc.distance(reference, typo) == 12

    # Reordered words.
    reordered = iscc.meta_id('Geschichte, Die Unendliche')[0]
    assert iscc.distance(reference, reordered) == 7

    # Bytes that are not valid UTF-8 are rejected.
    with pytest.raises(UnicodeDecodeError):
        iscc.meta_id(b"\xc3\x28")
def batch(path, recursive, guess, debug):
    """Create ISCC Codes for all files in PATH.

    Example:

      $ iscc batch ~/Documents

    """
    # NOTE(review): presumably registered as a click command, in which case
    # the docstring above is the CLI help text - it is kept unchanged.
    #
    # path/recursive are forwarded to get_files(); guess is forwarded to
    # get_title(); debug adds a stdout sink to the logger.
    # Every processed file is echoed as one comma separated line and
    # collected into the returned list of dicts. Files that are empty,
    # unsupported or fail to process are logged and skipped.
    if debug:
        log.add(sys.stdout)

    results = []
    for f in get_files(path, recursive=recursive):
        if not os.path.getsize(f):
            log.warning("Cannot proccess empty file: {}".format(f))
            continue

        media_type = mime_clean(mime_guess(f))
        if media_type not in SUPPORTED_MIME_TYPES:
            log.warning(
                "Unsupported file {} with mime type: {},,,,".format(
                    basename(f), media_type
                )
            )
            continue

        if media_type == "application/x-mobipocket-ebook":
            try:
                tempdir, epub_filepath = mobi.extract(f)
                try:
                    tika_result = parser.from_file(epub_filepath)
                finally:
                    # Remove the extraction dir even when parsing fails.
                    shutil.rmtree(tempdir)
            except Exception as e:
                # Name the file and the cause; the previous message logged
                # a literal, never-substituted "%s".
                log.error("Error with mobi extraction {} ({})".format(f, e))
                continue
        else:
            tika_result = parser.from_file(f)

        title = get_title(tika_result, guess=guess, uri=f)
        mid, norm_title, _ = iscc.meta_id(title)
        gmt = mime_to_gmt(media_type, file_path=f)

        if gmt == GMT.IMAGE:
            try:
                cid = iscc.content_id_image(f)
            except Exception as e:
                log.error("Clould not proccess image: {} ({})".format(f, e))
                continue
        elif gmt == GMT.TEXT:
            text = tika_result["content"]
            if not text:
                log.warning(
                    "Could not extract text from {}".format(basename(f))
                )
                continue
            cid = iscc.content_id_text(text)
        elif gmt == GMT.AUDIO:
            if not fpcalc.is_installed():
                fpcalc.install()
            features = audio_id.get_chroma_vector(f)
            cid = audio_id.content_id_audio(features)
        elif gmt == GMT.VIDEO:
            features = video_id.get_frame_vectors(abspath(f))
            cid = video_id.content_id_video(features)
        else:
            log.error("Could not generate ISCC")
            continue

        did = iscc.data_id(f)
        iid, tophash = iscc.instance_id(f)

        # The echoed line separates the components with commas ...
        iscc_code_cs = ",".join((mid, cid, did, iid))
        click.echo(
            "{iscc_code},{tophash},{fname},{gmt},{title}".format(
                iscc_code=iscc_code_cs,
                tophash=tophash,
                fname=basename(f),
                gmt=gmt,
                title=norm_title,
            )
        )

        # ... while the returned record uses the dash separated form.
        iscc_code = "-".join((mid, cid, did, iid))
        results.append(
            dict(
                iscc=iscc_code,
                norm_title=norm_title,
                tophash=tophash,
                gmt=gmt,
                file_name=basename(f),
            )
        )

    return results
def gen(file, guess, title, extra, verbose):
    """Generate ISCC Code for FILE."""
    # NOTE(review): presumably registered as a click command, in which case
    # the docstring above is the CLI help text - it is kept unchanged.
    #
    # file is an object exposing the path as .name; title/extra are optional
    # explicit metadata (the title falls back to get_title()); verbose
    # echoes additional details. Returns a dict with iscc, norm_title,
    # tophash and gmt, or None when no code could be generated.
    # Raises click.BadParameter for an empty file.
    path = file.name

    if not os.path.getsize(path):
        raise click.BadParameter("Cannot proccess empty file: {}".format(path))

    media_type = mime_clean(mime_guess(path))
    if media_type not in SUPPORTED_MIME_TYPES:
        click.echo("Unsupported media type {}.".format(media_type))
        click.echo(
            "Please request support at https://github.com/iscc/iscc-cli/issues"
        )
        # Stop here, as `web` does, instead of processing an unsupported file.
        return

    if media_type == "application/x-mobipocket-ebook":
        tempdir, epub_filepath = mobi.extract(path)
        try:
            tika_result = parser.from_file(epub_filepath)
        finally:
            # Remove the extraction dir even when parsing fails.
            shutil.rmtree(tempdir)
    else:
        tika_result = parser.from_file(path)

    if not title:
        title = get_title(tika_result, guess=guess, uri=path)
    if not extra:
        extra = ""

    mid, norm_title, _ = iscc.meta_id(title, extra)
    gmt = mime_to_gmt(media_type, file_path=path)

    if gmt == GMT.IMAGE:
        cid = iscc.content_id_image(path)
    elif gmt == GMT.TEXT:
        text = tika_result["content"]
        if not text:
            click.echo("Could not extract text from {}".format(path))
            return
        cid = iscc.content_id_text(text)
    elif gmt == GMT.AUDIO:
        if not fpcalc.is_installed():
            fpcalc.install()
        features = audio_id.get_chroma_vector(path)
        cid = audio_id.content_id_audio(features)
    elif gmt == GMT.VIDEO:
        features = video_id.get_frame_vectors(abspath(path))
        cid = video_id.content_id_video(features)
    else:
        click.echo("Could not generate ISCC")
        return

    did = iscc.data_id(path)
    iid, tophash = iscc.instance_id(path)

    # The MetaID is left out when there is no normalized title.
    if not norm_title:
        iscc_code = "-".join((cid, did, iid))
    else:
        iscc_code = "-".join((mid, cid, did, iid))

    click.echo("ISCC:{}".format(iscc_code))

    if verbose:
        if norm_title:
            click.echo("Norm Title: %s" % norm_title)
        click.echo("Tophash: %s" % tophash)
        click.echo("Filepath: %s" % path)
        click.echo("GMT: %s" % gmt)

    return dict(iscc=iscc_code, norm_title=norm_title, tophash=tophash, gmt=gmt)
def web(url, guess, title, extra, verbose):
    """Generate ISCC Code from URL."""
    # NOTE(review): presumably registered as a click command, in which case
    # the docstring above is the CLI help text - it is kept unchanged.
    #
    # The resource at url is downloaded into memory; title/extra are optional
    # explicit metadata (the title falls back to get_title()); verbose echoes
    # additional details. Returns a dict with iscc, norm_title, tophash and
    # gmt, or None when no code could be generated.
    # Raises click.BadArgumentUsage when the request itself fails.
    extra = extra or ""

    try:
        resp = requests.get(url, headers=HEADERS, stream=True)
    except Exception as e:
        # click expects a message string, not the exception object.
        raise click.BadArgumentUsage(str(e)) from e

    data = BytesIO(resp.content)
    media_type = clean_mime(detector.from_buffer(data))
    if media_type not in SUPPORTED_MIME_TYPES:
        click.echo("Unsupported media type {}".format(media_type))
        click.echo(
            "Please request support at https://github.com/iscc/iscc-cli/issues"
        )
        return

    # The buffer is rewound before every consumer reads it.
    data.seek(0)
    if media_type == "application/x-mobipocket-ebook":
        tempdir, filepath = mobi.extract(data)
        try:
            tika_result = parser.from_file(filepath)
        finally:
            # Remove the extraction dir even when parsing fails.
            shutil.rmtree(tempdir)
    else:
        tika_result = parser.from_buffer(data)

    if not title:
        title = get_title(tika_result, guess=guess, uri=url)

    mid, norm_title, _ = iscc.meta_id(title, extra)
    gmt = mime_to_gmt(media_type)

    if gmt == GMT.IMAGE:
        data.seek(0)
        cid = iscc.content_id_image(data)
    elif gmt == GMT.TEXT:
        text = tika_result["content"]
        if not text:
            click.echo("Could not extract text")
            return
        cid = iscc.content_id_text(text)
    elif gmt == GMT.AUDIO:
        if not fpcalc.is_installed():
            fpcalc.install()
        data.seek(0)
        features = audio_id.get_chroma_vector(data)
        cid = audio_id.content_id_audio(features)
    elif gmt == GMT.VIDEO:
        local_path = download_file(url, sanitize=True)
        try:
            features = video_id.get_frame_vectors(local_path)
            cid = video_id.content_id_video(features)
        finally:
            # Do not leave the downloaded copy behind on failure.
            os.remove(local_path)
    else:
        # Same fallback as `gen`; without it `cid` would be unbound below.
        click.echo("Could not generate ISCC")
        return

    data.seek(0)
    did = iscc.data_id(data)
    data.seek(0)
    iid, tophash = iscc.instance_id(data)

    # The MetaID is left out when there is no normalized title.
    if not norm_title:
        iscc_code = "-".join((cid, did, iid))
    else:
        iscc_code = "-".join((mid, cid, did, iid))

    click.echo("ISCC:{}".format(iscc_code))

    if verbose:
        if norm_title:
            click.echo("Norm Title: %s" % norm_title)
        click.echo("Tophash: %s" % tophash)
        click.echo("Filepath: %s" % url)
        click.echo("GMT: %s" % gmt)

    return dict(iscc=iscc_code, norm_title=norm_title, tophash=tophash, gmt=gmt)
def from_file(file: UploadFile = File(...), title: str = Form(""), extra: str = Form("")):
    """Generate Full ISCC Code from Media File with optional explicit metadata."""
    # NOTE(review): presumably an API endpoint whose docstring is published
    # as the endpoint description - it is kept unchanged.
    #
    # Returns a dict with iscc, tophash, gmt and bits, plus title/extra
    # fields when present.
    # Raises HTTPException 415 for unsupported media types and 422 when no
    # text could be extracted or no Content-ID could be generated.
    upload = file.file
    try:
        media_type = detector.from_buffer(upload)
        if media_type not in SUPPORTED_MIME_TYPES:
            raise HTTPException(
                HTTP_415_UNSUPPORTED_MEDIA_TYPE,
                "Unsupported media type '{}'. Please request support at "
                "https://github.com/iscc/iscc-service/issues.".format(media_type),
            )

        # The upload is rewound before every consumer reads it.
        upload.seek(0)
        if media_type == "application/x-mobipocket-ebook":
            tempdir, filepath = mobi.extract(upload)
            try:
                tika_result = parser.from_file(filepath)
            finally:
                # Remove the extraction dir even when parsing fails.
                shutil.rmtree(tempdir)
        else:
            tika_result = parser.from_buffer(upload)

        if not title:
            title = get_title(tika_result, guess=True)

        mid, norm_title, norm_extra = iscc.meta_id(title, extra)
        gmt = mime_to_gmt(media_type)

        if gmt == GMT.IMAGE:
            upload.seek(0)
            cid = iscc.content_id_image(upload)
        elif gmt == GMT.TEXT:
            text = tika_result["content"]
            if not text:
                raise HTTPException(
                    HTTP_422_UNPROCESSABLE_ENTITY, "Could not extract text"
                )
            cid = iscc.content_id_text(text)
        elif gmt == GMT.AUDIO:
            upload.seek(0)
            features = audio_id.get_chroma_vector(upload)
            cid = audio_id.content_id_audio(features)
        elif gmt == GMT.VIDEO:
            upload.seek(0)
            # Frame extraction works on a path, so spool the upload to a
            # uniquely named temporary file first.
            _, ext = splitext(file.filename)
            tmp_path = join(APP_DIR, "{}{}".format(uuid.uuid4(), ext))
            try:
                with open(tmp_path, "wb") as buffer:
                    shutil.copyfileobj(upload, buffer)
                features = video_id.get_frame_vectors(tmp_path)
                cid = video_id.content_id_video(features)
            finally:
                # Do not leave the temporary copy behind on failure.
                if os.path.exists(tmp_path):
                    os.remove(tmp_path)
        else:
            # Without this branch `cid` would be unbound below (HTTP 500).
            raise HTTPException(
                HTTP_422_UNPROCESSABLE_ENTITY, "Could not generate ISCC"
            )

        upload.seek(0)
        did = iscc.data_id(upload)
        upload.seek(0)
        iid, tophash = iscc.instance_id(upload)

        # The MetaID is left out when there is no normalized title.
        if not norm_title:
            iscc_code = "-".join((cid, did, iid))
        else:
            iscc_code = "-".join((mid, cid, did, iid))

        components = iscc_split(iscc_code)
        result = dict(
            iscc=iscc_code,
            tophash=tophash,
            gmt=gmt,
            bits=[code_to_bits(c) for c in components],
        )
        if norm_title:
            result["title"] = title
            result["title_trimmed"] = norm_title
        if norm_extra:
            result["extra"] = extra
            result["extra_trimmed"] = norm_extra
        return result
    finally:
        # Release the upload on success and on every error path.
        upload.close()