Ejemplo n.º 1
0
def init():
    """Bring up Tika and make sure fpcalc is available, installing it if missing."""
    click.echo("Inititalizing Tika ...")
    # A throwaway detection request; judging by the payload this is meant to
    # wake the Tika server before its version endpoint is queried.
    detector.from_buffer(io.BytesIO(b"Wakeup Tika"))
    version_response = requests.get(detector.ServerEndpoint + "/version")
    click.echo("Tika initialized: {}".format(version_response.text))

    click.echo("Testing fpcalc ...")
    if not fpcalc.is_installed():
        fpcalc.install()
    click.echo("fpcalc installed: {}".format(fpcalc.get_version_info()))
Ejemplo n.º 2
0
def batch(path, recursive, guess, debug):
    """Create ISCC Codes for all files in PATH.

    Example:

      $ iscc batch ~/Documents

    """
    # NOTE(review): the docstring above is left untouched because it
    # presumably doubles as the CLI help text; details are documented here.
    #
    # Parameters:
    #   path      -- location handed to get_files() to enumerate candidates.
    #   recursive -- forwarded to get_files(recursive=...).
    #   guess     -- forwarded to get_title(guess=...).
    #   debug     -- when truthy, log output is additionally sent to stdout.
    #
    # Returns a list with one dict per successfully processed file, holding
    # the keys "iscc", "norm_title", "tophash", "gmt" and "file_name".
    # Files that are empty, unsupported or fail to process are logged and
    # skipped rather than aborting the whole run.
    if debug:
        log.add(sys.stdout)

    results = []
    for f in get_files(path, recursive=recursive):
        if not os.path.getsize(f):
            msg = "Cannot proccess empty file: {}".format(f)
            log.warning(msg)
            continue

        media_type = mime_clean(mime_guess(f))
        if media_type not in SUPPORTED_MIME_TYPES:
            fname = basename(f)
            msg = "Unsupported file {} with mime type: {},,,,".format(
                fname, media_type)
            log.warning(msg)
            continue

        if media_type == "application/x-mobipocket-ebook":
            try:
                tempdir, epub_filepath = mobi.extract(f)
                try:
                    tika_result = parser.from_file(epub_filepath)
                finally:
                    # Always remove the extraction directory, even when
                    # parsing the extracted file fails.
                    shutil.rmtree(tempdir)
            except Exception as e:
                # Report which file failed and why; the message previously
                # contained an unformatted "%s" placeholder.
                msg = "Error with mobi extraction {} ({})".format(f, e)
                log.error(msg)
                continue
        else:
            tika_result = parser.from_file(f)

        title = get_title(tika_result, guess=guess, uri=f)

        mid, norm_title, _ = iscc.meta_id(title)
        gmt = mime_to_gmt(media_type, file_path=f)
        if gmt == GMT.IMAGE:
            try:
                cid = iscc.content_id_image(f)
            except Exception as e:
                msg = "Clould not proccess image: {} ({})".format(f, e)
                log.error(msg)
                continue

        elif gmt == GMT.TEXT:
            text = tika_result["content"]
            if not text:
                msg = "Could not extract text from {}".format(basename(f))
                log.warning(msg)
                continue
            cid = iscc.content_id_text(text)
        elif gmt == GMT.AUDIO:
            if not fpcalc.is_installed():
                fpcalc.install()
            features = audio_id.get_chroma_vector(f)
            cid = audio_id.content_id_audio(features)
        elif gmt == GMT.VIDEO:
            features = video_id.get_frame_vectors(abspath(f))
            cid = video_id.content_id_video(features)
        else:
            log.error("Could not generate ISCC")
            continue

        did = iscc.data_id(f)
        iid, tophash = iscc.instance_id(f)

        # The echoed line separates the four code components with commas,
        # while the returned result joins them with dashes.
        iscc_code_cs = ",".join((mid, cid, did, iid))

        click.echo("{iscc_code},{tophash},{fname},{gmt},{title}".format(
            iscc_code=iscc_code_cs,
            tophash=tophash,
            fname=basename(f),
            gmt=gmt,
            title=norm_title,
        ))
        iscc_code = "-".join((mid, cid, did, iid))
        results.append(
            dict(
                iscc=iscc_code,
                norm_title=norm_title,
                tophash=tophash,
                gmt=gmt,
                file_name=basename(f),
            ))

    return results
Ejemplo n.º 3
0
def gen(file, guess, title, extra, verbose):
    """Generate ISCC Code for FILE."""
    # NOTE(review): the docstring above is left untouched because it
    # presumably doubles as the CLI help text; details are documented here.
    #
    # Parameters:
    #   file    -- object exposing a ``name`` attribute with the file path.
    #   guess   -- forwarded to get_title(guess=...) when no title is given.
    #   title   -- explicit title; when falsy one is derived via get_title().
    #   extra   -- extra metadata for iscc.meta_id(); falsy becomes "".
    #   verbose -- when truthy, title/tophash/path/GMT are echoed as well.
    #
    # Returns a dict with keys "iscc", "norm_title", "tophash" and "gmt", or
    # None when the media type is unsupported, no text could be extracted,
    # or no content id could be generated.
    # Raises click.BadParameter for an empty file.
    filesize = os.path.getsize(file.name)
    if not filesize:
        raise click.BadParameter("Cannot proccess empty file: {}".format(
            file.name))

    media_type = mime_clean(mime_guess(file.name))
    if media_type not in SUPPORTED_MIME_TYPES:
        click.echo("Unsupported media type {}.".format(media_type))
        click.echo(
            "Please request support at https://github.com/iscc/iscc-cli/issues"
        )
        # Stop here instead of carrying on with a type that was just
        # reported as unsupported.
        return

    if media_type == "application/x-mobipocket-ebook":
        tempdir, epub_filepath = mobi.extract(file.name)
        try:
            tika_result = parser.from_file(epub_filepath)
        finally:
            # Remove the extraction directory even if parsing fails.
            shutil.rmtree(tempdir)
    else:
        tika_result = parser.from_file(file.name)

    if not title:
        title = get_title(tika_result, guess=guess, uri=file.name)

    if not extra:
        extra = ""

    mid, norm_title, _ = iscc.meta_id(title, extra)
    gmt = mime_to_gmt(media_type, file_path=file.name)
    if gmt == GMT.IMAGE:
        cid = iscc.content_id_image(file.name)
    elif gmt == GMT.TEXT:
        text = tika_result["content"]
        if not text:
            click.echo("Could not extract text from {}".format(file.name))
            return
        cid = iscc.content_id_text(text)
    elif gmt == GMT.AUDIO:
        if not fpcalc.is_installed():
            fpcalc.install()
        features = audio_id.get_chroma_vector(file.name)
        cid = audio_id.content_id_audio(features)
    elif gmt == GMT.VIDEO:
        features = video_id.get_frame_vectors(abspath(file.name))
        cid = video_id.content_id_video(features)
    else:
        click.echo("Could not generate ISCC")
        return

    did = iscc.data_id(file.name)
    iid, tophash = iscc.instance_id(file.name)

    # Without a normalized title the meta id component is left out.
    if not norm_title:
        iscc_code = "-".join((cid, did, iid))
    else:
        iscc_code = "-".join((mid, cid, did, iid))

    click.echo("ISCC:{}".format(iscc_code))

    if verbose:
        if norm_title:
            click.echo("Norm Title: %s" % norm_title)
        click.echo("Tophash:    %s" % tophash)
        click.echo("Filepath:   %s" % file.name)
        click.echo("GMT:        %s" % gmt)

    return dict(iscc=iscc_code,
                norm_title=norm_title,
                tophash=tophash,
                gmt=gmt)
Ejemplo n.º 4
0
def web(url, guess, title, extra, verbose):
    """Generate ISCC Code from URL."""
    # NOTE(review): the docstring above is left untouched because it
    # presumably doubles as the CLI help text; details are documented here.
    #
    # Parameters:
    #   url     -- address fetched with requests.get().
    #   guess   -- forwarded to get_title(guess=...) when no title is given.
    #   title   -- explicit title; when falsy one is derived via get_title().
    #   extra   -- extra metadata for iscc.meta_id(); falsy becomes "".
    #   verbose -- when truthy, title/tophash/url/GMT are echoed as well.
    #
    # Returns a dict with keys "iscc", "norm_title", "tophash" and "gmt", or
    # None when the media type is unsupported, no text could be extracted,
    # or no content id could be generated.
    # Raises click.BadArgumentUsage when the request itself fails.

    extra = extra or ""

    try:
        resp = requests.get(url, headers=HEADERS, stream=True)
    except Exception as e:
        # Pass the message as text; the exception object itself is not a
        # valid click error message.
        raise click.BadArgumentUsage(str(e))

    # The whole response body is held in memory and rewound before each use.
    data = BytesIO(resp.content)
    media_type = clean_mime(detector.from_buffer(data))
    if media_type not in SUPPORTED_MIME_TYPES:
        click.echo("Unsupported media type {}".format(media_type))
        click.echo(
            "Please request support at https://github.com/iscc/iscc-cli/issues"
        )
        return

    if media_type == "application/x-mobipocket-ebook":
        data.seek(0)
        tempdir, filepath = mobi.extract(data)
        try:
            tika_result = parser.from_file(filepath)
        finally:
            # Remove the extraction directory even if parsing fails.
            shutil.rmtree(tempdir)
    else:
        data.seek(0)
        tika_result = parser.from_buffer(data)

    if not title:
        title = get_title(tika_result, guess=guess, uri=url)

    mid, norm_title, _ = iscc.meta_id(title, extra)
    gmt = mime_to_gmt(media_type)
    if gmt == GMT.IMAGE:
        data.seek(0)
        cid = iscc.content_id_image(data)
    elif gmt == GMT.TEXT:
        text = tika_result["content"]
        if not text:
            click.echo("Could not extract text")
            return
        cid = iscc.content_id_text(text)
    elif gmt == GMT.AUDIO:
        if not fpcalc.is_installed():
            fpcalc.install()
        data.seek(0)
        features = audio_id.get_chroma_vector(data)
        cid = audio_id.content_id_audio(features)
    elif gmt == GMT.VIDEO:
        # Video frames are read from a local file, so the URL is downloaded
        # separately and the download is removed again afterwards.
        local_path = download_file(url, sanitize=True)
        try:
            features = video_id.get_frame_vectors(local_path)
            cid = video_id.content_id_video(features)
        finally:
            os.remove(local_path)
    else:
        # Without this branch ``cid`` would be unbound further down.
        click.echo("Could not generate ISCC")
        return

    data.seek(0)
    did = iscc.data_id(data)
    data.seek(0)
    iid, tophash = iscc.instance_id(data)

    # Without a normalized title the meta id component is left out.
    if not norm_title:
        iscc_code = "-".join((cid, did, iid))
    else:
        iscc_code = "-".join((mid, cid, did, iid))

    click.echo("ISCC:{}".format(iscc_code))

    if verbose:
        if norm_title:
            click.echo("Norm Title: %s" % norm_title)
        click.echo("Tophash:    %s" % tophash)
        click.echo("Filepath:   %s" % url)
        click.echo("GMT:        %s" % gmt)

    return dict(iscc=iscc_code,
                norm_title=norm_title,
                tophash=tophash,
                gmt=gmt)
Ejemplo n.º 5
0
def test_get_chroma_vector_file_path():
    """get_chroma_vector() on a file path returns the known vector for demo.mp3."""
    # Reference values for tests/audio/demo.mp3, in the order they are returned.
    expected_vector = [
        684003877, 683946551, 1749295639, 2017796679,
        2026256086, 2022066918, 2022001639, 2021968035,
        2038741139, 2059709571, 503750851, 369541315,
        320225426, 289292450, 830368930, 838789539,
        1940835201, 1928186752, 1651297920, 1651283600,
        1650959072, 1655022116, 1722069540, 1726259749,
        1713694254, 1847914286, 1847912494, 1780832302,
        -362410962, -352973810, 1809196111, 1770397775,
        1753686797, 683942429, 943989277, 943989255,
        944121430, 952503910, 948374246, 948717799,
        1485621411, 462203011, 508470403, 370053251,
        303988867, 322879651, 322892963, 862907811,
        1928256417, 1928317841, 1651297152, 1647091344,
        1650827936, 1659216416, 1722069540, 1726263844,
        1717887533, 1713696302, 1847912494, 1847883822,
        -366540754, -345633778, -336184242, 1771447375,
        1753620815, 1757684255, 675553815, 943989255,
        944120390, 952508006, 948308582, 948718050,
        411879650, 428648578, 516861059, 370057347,
        303988865, 306086033, 306086049, 841919649,
        846133665, 1919929264, 1647168400, 1647101584,
        1650827936, 1659216484, 1671733796, 1738838588,
        1717887517, 1713696302, 1847913774, 1847879726,
        1780960302, -362410978, -336196594, 1775641678,
        1770397775, 1753555743, 683942429, 943989271,
        944185926, 2026255094, 2022051494, 2021919654,
    ]

    # Make sure fpcalc is installed before the vector is requested.
    fpcalc_ready = fpcalc.is_installed()
    if not fpcalc_ready:
        fpcalc.install()

    chroma_vector = audio_id.get_chroma_vector("tests/audio/demo.mp3")
    assert isinstance(chroma_vector, list)
    assert chroma_vector == expected_vector
Ejemplo n.º 6
0
def test_install():
    """install() returns a path that exists, and fpcalc is then reported as installed."""
    installed_path = fpcalc.install()
    path_exists = os.path.exists(installed_path)
    assert path_exists
    assert fpcalc.is_installed()
Ejemplo n.º 7
0
def test_is_installed():
    """is_installed() must answer with a bool."""
    status = fpcalc.is_installed()
    assert isinstance(status, bool)