Example #1
0
def dump(path, strip, meta, content):
    """Dump Tika extraction results for PATH (file or url path)."""
    # NOTE: the docstring above is kept verbatim; it may be rendered as
    # command help text (assumption -- decorators are not visible here).
    #
    # path:    location handed to the MIME guesser and to Tika.
    # strip:   if truthy, the extracted content is cut to this many characters.
    # meta:    if truthy, print only the "metadata" part of the result.
    # content: if truthy, print only the "content" part of the result.
    # Raises UsageError when both `meta` and `content` are requested.

    # Reject contradictory flags before doing any extraction work.
    if meta and content:
        raise UsageError("Use either --meta or --content for selective output.")

    media_type = mime_clean(mime_guess(path))

    if media_type not in SUPPORTED_MIME_TYPES:
        # Only a warning: extraction is still attempted below.
        click.echo("Unsupported media type {}.".format(media_type))
        click.echo("Please request support at https://github.com/iscc/iscc-cli/issues")

    if media_type == "application/x-mobipocket-ebook":
        # MOBI files are unpacked first; Tika parses the unpacked file.
        tempdir, epub_filepath = mobi.extract(path)
        try:
            tika_result = parser.from_file(epub_filepath)
        finally:
            # Always remove the unpack directory, even if parsing failed.
            shutil.rmtree(tempdir)
    else:
        tika_result = parser.from_file(path)

    if strip:
        # `or ""` also covers a "content" key that is present but None,
        # which would otherwise fail on slicing.
        tika_result["content"] = (tika_result.get("content") or "")[:strip]

    if meta:
        selected = tika_result.get("metadata", "")
    elif content:
        selected = tika_result.get("content", "")
    else:
        selected = tika_result
    click.echo(json.dumps(selected, indent=2))
Example #2
0
def extract_mobi_folder(bookdir, force=False):
    """Unpack the ``.mobi`` files found directly inside ``bookdir``.

    Each book is extracted with ``mobi.extract`` and the ``mobi7`` sub-folder
    of the extraction result is handed to ``copy_delcopy`` together with a
    target path equal to the book path minus its extension.

    Args:
        bookdir: Directory that is searched (non-recursively) for books.
        force: When false, books whose target directory already exists are
            skipped. When true, every ``*.mobi`` match is processed.

    Returns:
        list: The target paths (book path without extension), one per
        processed book, in processing order.
    """
    if force:
        mobilist = glob(bookdir + "/*.mobi")
    else:
        # Close the scandir iterator deterministically.
        with os.scandir(bookdir) as entries:
            mobilist = [
                entry.path for entry in entries
                if entry.is_file()
                and os.path.splitext(entry.path)[1] in (".mobi", )
                and not os.path.isdir(os.path.splitext(entry.path)[0])
            ]

    convlist = []
    for book_path in mobilist:
        target = os.path.splitext(book_path)[0]
        tempdir, _ = mobi.extract(book_path)
        try:
            # os.path.join instead of a hard-coded "\\" separator: with the
            # literal backslash, os.path.dirname() on POSIX returned the
            # *parent* of the temp dir, which was then removed recursively.
            copy_delcopy(os.path.join(tempdir, "mobi7"), target)
        finally:
            # Remove exactly the directory mobi.extract created.
            shutil.rmtree(tempdir)
        convlist.append(target)
    return convlist
Example #3
0
def test_extract():
    """Each e-book in TEST_DIR with a known extension extracts successfully.

    For every matching file, both the temp directory and the extracted file
    path returned by ``mobi.extract`` must exist; the directory is removed
    afterwards.
    """
    known_extensions = (".MOBI", ".PRC", ".AZW", ".AZW3", ".AZW4")
    ebooks = (
        name for name in os.listdir(TEST_DIR)
        if splitext(name)[-1].upper() in known_extensions
    )
    for name in ebooks:
        workdir, extracted = mobi.extract(join(TEST_DIR, name))
        assert exists(workdir)
        assert exists(extracted)
        shutil.rmtree(workdir)
Example #4
0
def main(argv):
    """Unpack a MOBI file into a directory.

    The input path and the optional output directory are read from
    ``sys.argv`` (positions 1 and 2). The ``argv`` parameter is accepted but
    not consulted, matching the original behaviour.

    When no output directory is given, it is derived from the input path by
    replacing a trailing ``.mobi`` extension (any letter case) with
    ``_unpacked_mobi``, or by appending that suffix when the input has a
    different extension.

    Raises:
        SystemExit: If no input path was supplied on the command line.
    """
    if len(sys.argv) < 2:
        raise SystemExit(
            'usage: {} INPUT.mobi [OUTPUT_DIR]'.format(sys.argv[0]))

    source = sys.argv[1]
    if len(sys.argv) <= 2:
        suffix = '.mobi'
        # Only touch the trailing extension; str.replace() would also rewrite
        # ".mobi" inside directory names and leave other extensions unchanged,
        # making the output path equal to the input path.
        if source.lower().endswith(suffix):
            output = source[:-len(suffix)] + '_unpacked_mobi'
        else:
            output = source + '_unpacked_mobi'
    else:
        output = sys.argv[2]

    print ('Unpacking MOBI to {}'.format(output))
    tempdir, _ = mobi.extract(source)
    shutil.move(tempdir, output)
Example #5
0
 def unpack_mobi(self, filename):
     """Unpack ``filename`` into the MOBI storage area, reusing earlier results.

     The storage area is searched for a file whose stem equals the MD5 of
     ``filename`` (as computed by ``generate_file_md5``); if one exists its
     path is returned as a string and nothing is extracted. Otherwise the
     book is extracted with ``mobi.extract``:

     * an extracted ``.html`` file is passed, with its temp directory, to
       ``create_valid_epub_from_epub_like_structure`` and that result is
       returned;
     * any other extracted file is copied into the storage area as
       ``<md5>.<ext>`` and the destination path object is returned.

     NOTE(review): the cache-hit branch returns ``str`` while the copy branch
     returns the object produced by ``storage_area.joinpath``; left as-is for
     caller compatibility.
     """
     storage_area = self.get_mobi_storage_area()
     filemd5 = generate_file_md5(filename)
     for fname in storage_area.iterdir():
         if fname.is_file() and (fname.stem == filemd5):
             return str(fname)
     with mute_stdout():
         tempdir, extracted_file = mobi.extract(str(filename))
     filetype = Path(extracted_file).suffix.strip(".")
     if filetype == "html":
         # The temp dir is handed over to the helper; presumably the helper
         # owns its cleanup -- TODO confirm.
         return self.create_valid_epub_from_epub_like_structure(
             tempdir, storage_area.joinpath(f"{filemd5}.epub"))
     dst_filename = storage_area.joinpath(f"{filemd5}.{filetype}")
     try:
         shutil.copy(extracted_file, dst_filename)
     finally:
         # Public shutil.rmtree instead of the private
         # TemporaryDirectory._rmtree; runs even when the copy fails.
         shutil.rmtree(tempdir)
     return dst_filename
Example #6
0
def parse(dir, outPath):
    """Convert every ``.mobi`` file below ``dir`` into a text file in ``outPath``.

    Each book is extracted with ``mobi.extract``, the extracted file is read
    as UTF-8 HTML and parsed with BeautifulSoup, and the document text is
    written to ``<outPath>/<original file name>.txt``. ``outPath`` is created
    on demand. A failure on one book is printed (with traceback) and the walk
    continues with the next file.

    Args:
        dir: Root directory that is walked recursively.
        outPath: Directory that receives the ``.txt`` files.

    Returns:
        None
    """
    count = 0
    for root, _dirs, files in os.walk(dir):
        for file in files:
            if not file.endswith(".mobi"):
                continue
            # Full path of the book to convert.
            path = os.path.join(root, file)
            try:
                tempdir, filepath = mobi.extract(path)
                try:
                    # `filepath` is expected to point at an HTML file.
                    with open(filepath, 'r', encoding='utf-8') as book:
                        soup = BeautifulSoup(book.read(), "html.parser")

                    # Create the requested output directory itself (the old
                    # code created "<cwd>/out" regardless of `outPath`).
                    os.makedirs(outPath, exist_ok=True)

                    # Use the name os.walk gave us rather than splitting the
                    # path on "/", which breaks with other separators.
                    savePath = join(outPath, file + ".txt")
                    with open(savePath, 'w', encoding='utf-8') as out:
                        out.write(soup.text)
                finally:
                    # Remove the extraction temp dir on success and failure.
                    shutil.rmtree(tempdir)

                count = count + 1
                if count % 10 == 0:
                    # Progress message every ten converted books.
                    print("处理了" + str(count))
            except Exception as err:
                # Best effort: report the problem and move on.
                print(err)
                traceback.print_exc()
                continue
0
    def import_mobi_file(self, text_file_path):
        """Extract a MOBI file and load its text into the temporary data store.

        The book is unpacked with ``mobi.extract``. If the extracted file ends
        in ``.txt`` or ``.html`` (case-insensitive) its content is converted
        with ``html2text`` and passed to ``save_temp_data``; otherwise the
        loading dialog is dismissed and an error instruction is shown. In both
        cases the text preview is refreshed and the loading dialog dismissed
        at the end, and the extraction temp directory is removed.
        """
        tempdir, filepath = mobi.extract(text_file_path)
        try:
            # A TXT or HTML result means extraction worked. Check the actual
            # extension: the previous unanchored regex with unescaped dots
            # could also match inside a directory name.
            if filepath.lower().endswith(('.txt', '.html')):
                with open(filepath, 'r', errors='ignore') as file:
                    content = file.read()
                # Drop literal backslash-n sequences before conversion.
                new_text = html2text.html2text(content.replace('\\n', ''))

                self.save_temp_data(new_text)
            # Any other result (e.g. EPUB) is treated as unusable; per the
            # original author's note this indicates an encrypted MOBI file.
            else:
                self.text_loading_dialog.dismiss()
                self.show_instructions(
                    'Something went wrong :( The file provided cannot be processed. Please try another one.'
                )
        finally:
            # Clean up on both branches and on errors.
            shutil.rmtree(tempdir, ignore_errors=True)

        self.update_text_preview()
        self.text_loading_dialog.dismiss()
Example #8
0
def batch(path, recursive, guess, debug):
    """Create ISCC Codes for all files in PATH.

    Example:

      $ iscc batch ~/Documents

    """
    # NOTE: the docstring above is kept verbatim; it may be rendered as
    # command help text (assumption -- decorators are not visible here).
    #
    # For every processable file one comma-separated line is echoed and one
    # dict (iscc, norm_title, tophash, gmt, file_name) is appended to the
    # returned list. Files that cannot be processed are logged and skipped.
    if debug:
        log.add(sys.stdout)

    results = []
    for f in get_files(path, recursive=recursive):
        filesize = os.path.getsize(f)
        if not filesize:
            msg = "Cannot proccess empty file: {}".format(f)
            log.warning(msg)
            continue

        media_type = mime_clean(mime_guess(f))
        if media_type not in SUPPORTED_MIME_TYPES:
            fname = basename(f)
            msg = "Unsupported file {} with mime type: {},,,,".format(
                fname, media_type)
            log.warning(msg)
            continue

        if media_type == "application/x-mobipocket-ebook":
            try:
                tempdir, epub_filepath = mobi.extract(f)
                try:
                    tika_result = parser.from_file(epub_filepath)
                finally:
                    # Remove the unpack directory even if Tika fails.
                    shutil.rmtree(tempdir)
            except Exception as e:
                # Include the actual error; the old message logged a literal
                # "%s" placeholder and discarded the exception.
                msg = "Error with mobi extraction {}".format(e)
                log.error(msg)
                continue
        else:
            tika_result = parser.from_file(f)

        title = get_title(tika_result, guess=guess, uri=f)

        mid, norm_title, _ = iscc.meta_id(title)
        gmt = mime_to_gmt(media_type, file_path=f)
        if gmt == GMT.IMAGE:
            try:
                cid = iscc.content_id_image(f)
            except Exception as e:
                msg = "Clould not proccess image: {} ({})".format(f, e)
                log.error(msg)
                continue

        elif gmt == GMT.TEXT:
            text = tika_result["content"]
            if not text:
                msg = "Could not extract text from {}".format(basename(f))
                log.warning(msg)
                continue
            cid = iscc.content_id_text(text)
        elif gmt == GMT.AUDIO:
            if not fpcalc.is_installed():
                fpcalc.install()
            features = audio_id.get_chroma_vector(f)
            cid = audio_id.content_id_audio(features)
        elif gmt == GMT.VIDEO:
            features = video_id.get_frame_vectors(abspath(f))
            cid = video_id.content_id_video(features)
        else:
            log.error("Could not generate ISCC")
            continue

        did = iscc.data_id(f)
        iid, tophash = iscc.instance_id(f)

        # Comma-separated form for the echoed line, dash-separated form for
        # the returned record.
        iscc_code_cs = ",".join((mid, cid, did, iid))

        click.echo("{iscc_code},{tophash},{fname},{gmt},{title}".format(
            iscc_code=iscc_code_cs,
            tophash=tophash,
            fname=basename(f),
            gmt=gmt,
            title=norm_title,
        ))
        iscc_code = "-".join((mid, cid, did, iid))
        results.append(
            dict(
                iscc=iscc_code,
                norm_title=norm_title,
                tophash=tophash,
                gmt=gmt,
                file_name=basename(f),
            ))

    return results
Example #9
0
def gen(file, guess, title, extra, verbose):
    """Generate ISCC Code for FILE."""
    # NOTE: the docstring above is kept verbatim; it may be rendered as
    # command help text (assumption -- decorators are not visible here).
    #
    # file:    object whose `.name` is the path of the file to process.
    # guess:   forwarded to get_title() when no explicit title is given.
    # title:   explicit title; if falsy it is derived from the Tika result.
    # extra:   extra metadata passed to iscc.meta_id(); falsy becomes "".
    # verbose: if truthy, echo title, tophash, path and GMT as well.
    # Returns a dict (iscc, norm_title, tophash, gmt), or None when no content
    # ID could be produced. Raises click.BadParameter for an empty file.
    filesize = os.path.getsize(file.name)
    if not filesize:
        raise click.BadParameter("Cannot proccess empty file: {}".format(
            file.name))

    media_type = mime_clean(mime_guess(file.name))
    if media_type not in SUPPORTED_MIME_TYPES:
        # Only a warning: processing is still attempted below.
        click.echo("Unsupported media type {}.".format(media_type))
        click.echo(
            "Please request support at https://github.com/iscc/iscc-cli/issues"
        )

    if media_type == "application/x-mobipocket-ebook":
        tempdir, epub_filepath = mobi.extract(file.name)
        try:
            tika_result = parser.from_file(epub_filepath)
        finally:
            # Remove the unpack directory even if Tika fails.
            shutil.rmtree(tempdir)
    else:
        tika_result = parser.from_file(file.name)

    if not title:
        title = get_title(tika_result, guess=guess, uri=file.name)

    if not extra:
        extra = ""

    mid, norm_title, _ = iscc.meta_id(title, extra)
    gmt = mime_to_gmt(media_type, file_path=file.name)
    if gmt == GMT.IMAGE:
        cid = iscc.content_id_image(file.name)
    elif gmt == GMT.TEXT:
        text = tika_result["content"]
        if not text:
            click.echo("Could not extract text from {}".format(file.name))
            return
        cid = iscc.content_id_text(text)
    elif gmt == GMT.AUDIO:
        if not fpcalc.is_installed():
            fpcalc.install()
        features = audio_id.get_chroma_vector(file.name)
        cid = audio_id.content_id_audio(features)
    elif gmt == GMT.VIDEO:
        features = video_id.get_frame_vectors(abspath(file.name))
        cid = video_id.content_id_video(features)
    else:
        click.echo("Could not generate ISCC")
        return

    did = iscc.data_id(file.name)
    iid, tophash = iscc.instance_id(file.name)

    # The meta ID is left out of the code when the normalized title is empty.
    if not norm_title:
        iscc_code = "-".join((cid, did, iid))
    else:
        iscc_code = "-".join((mid, cid, did, iid))

    click.echo("ISCC:{}".format(iscc_code))

    if verbose:
        if norm_title:
            click.echo("Norm Title: %s" % norm_title)
        click.echo("Tophash:    %s" % tophash)
        click.echo("Filepath:   %s" % file.name)
        click.echo("GMT:        %s" % gmt)

    return dict(iscc=iscc_code,
                norm_title=norm_title,
                tophash=tophash,
                gmt=gmt)
Example #10
0
def web(url, guess, title, extra, verbose):
    """Generate ISCC Code from URL."""
    # NOTE: the docstring above is kept verbatim; it may be rendered as
    # command help text (assumption -- decorators are not visible here).
    #
    # url:     address whose response body is processed.
    # guess:   forwarded to get_title() when no explicit title is given.
    # title:   explicit title; if falsy it is derived from the Tika result.
    # extra:   extra metadata passed to iscc.meta_id(); falsy becomes "".
    # verbose: if truthy, echo title, tophash, URL and GMT as well.
    # Returns a dict (iscc, norm_title, tophash, gmt), or None when the media
    # type is unsupported or no content ID could be produced.
    # Raises click.BadArgumentUsage when the request itself fails.

    extra = extra or ""

    try:
        resp = requests.get(url, headers=HEADERS, stream=True)
    except Exception as e:
        # Pass the message text (not the exception object) and keep the cause.
        raise click.BadArgumentUsage(str(e)) from e

    data = BytesIO(resp.content)
    media_type = clean_mime(detector.from_buffer(data))
    if media_type not in SUPPORTED_MIME_TYPES:
        click.echo("Unsupported media type {}".format(media_type))
        click.echo(
            "Please request support at https://github.com/iscc/iscc-cli/issues"
        )
        return

    if media_type == "application/x-mobipocket-ebook":
        data.seek(0)
        tempdir, filepath = mobi.extract(data)
        try:
            tika_result = parser.from_file(filepath)
        finally:
            # Remove the unpack directory even if Tika fails.
            shutil.rmtree(tempdir)
    else:
        data.seek(0)
        tika_result = parser.from_buffer(data)

    if not title:
        title = get_title(tika_result, guess=guess, uri=url)

    mid, norm_title, _ = iscc.meta_id(title, extra)
    gmt = mime_to_gmt(media_type)
    if gmt == GMT.IMAGE:
        data.seek(0)
        cid = iscc.content_id_image(data)
    elif gmt == GMT.TEXT:
        text = tika_result["content"]
        if not text:
            click.echo("Could not extract text")
            return
        cid = iscc.content_id_text(text)
    elif gmt == GMT.AUDIO:
        if not fpcalc.is_installed():
            fpcalc.install()
        data.seek(0)
        features = audio_id.get_chroma_vector(data)
        cid = audio_id.content_id_audio(features)
    elif gmt == GMT.VIDEO:
        # Frame extraction works on a file path, so the URL is downloaded to
        # a local file that is removed afterwards, also on failure.
        local_path = download_file(url, sanitize=True)
        try:
            features = video_id.get_frame_vectors(local_path)
            cid = video_id.content_id_video(features)
        finally:
            os.remove(local_path)
    else:
        # Without this branch `cid` would be unbound further down.
        click.echo("Could not generate ISCC")
        return

    data.seek(0)
    did = iscc.data_id(data)
    data.seek(0)
    iid, tophash = iscc.instance_id(data)

    # The meta ID is left out of the code when the normalized title is empty.
    if not norm_title:
        iscc_code = "-".join((cid, did, iid))
    else:
        iscc_code = "-".join((mid, cid, did, iid))

    click.echo("ISCC:{}".format(iscc_code))

    if verbose:
        if norm_title:
            click.echo("Norm Title: %s" % norm_title)
        click.echo("Tophash:    %s" % tophash)
        click.echo("Filepath:   %s" % url)
        click.echo("GMT:        %s" % gmt)

    return dict(iscc=iscc_code,
                norm_title=norm_title,
                tophash=tophash,
                gmt=gmt)
Example #11
0
def test_extract_file_like():
    """``mobi.extract`` is given an open binary file object instead of a path.

    Both the temp directory and the extracted file path it returns must exist;
    the directory is removed afterwards.
    """
    demo_path = join(TEST_DIR, "demo.mobi")
    with open(demo_path, "rb") as stream:
        workdir, extracted = mobi.extract(stream)
        assert exists(workdir)
        assert exists(extracted)
        shutil.rmtree(workdir)
Example #12
0
from bs4 import BeautifulSoup
from weasyprint import HTML, CSS, default_url_fetcher
import mobi
import shutil
import sys

if __name__ == "__main__":
    # Convert the MOBI file named by the first command-line argument into
    # "<input file name>.pdf" in the current working directory.
    import os  # local import: only this entry point needs it

    unzip_file_path = sys.argv[1]
    tempdir, filepath = mobi.extract(unzip_file_path)
    try:
        # Directory of the extracted file, with a trailing separator, used as
        # base URL for relative resources. This replaces the magic slice
        # `filepath[:-9]`, which only worked for a 9-character file name.
        image_base = os.path.join(os.path.dirname(filepath), "")
        html = HTML(filename=filepath, base_url=image_base, encoding="utf8")
        # basename() handles the platform's separators, unlike split("/").
        filename = os.path.basename(unzip_file_path)
        html.write_pdf(filename + '.pdf')
    finally:
        # Remove the extraction directory even if rendering fails.
        shutil.rmtree(tempdir)
Example #13
0
def from_file(file: UploadFile = File(...),
              title: str = Form(""),
              extra: str = Form("")):
    """Generate Full ISCC Code from Media File with optional explicit metadata."""
    # NOTE: the docstring above is kept verbatim; it may be surfaced as the
    # endpoint description (assumption -- decorators are not visible here).
    #
    # file:  uploaded media file; `file.file` is read several times, so it is
    #        rewound with seek(0) before each consumer.
    # title: explicit title; if empty it is derived from the Tika result.
    # extra: optional extra metadata passed to iscc.meta_id().
    # Returns a dict with iscc, tophash, gmt and bits, plus title/extra fields
    # when their normalized forms are non-empty.
    # Raises HTTPException 415 for unsupported media types and 422 when no
    # content ID can be produced.
    try:
        media_type = detector.from_buffer(file.file)
        if media_type not in SUPPORTED_MIME_TYPES:
            raise HTTPException(
                HTTP_415_UNSUPPORTED_MEDIA_TYPE,
                "Unsupported media type '{}'. Please request support at "
                "https://github.com/iscc/iscc-service/issues.".format(media_type),
            )

        if media_type == "application/x-mobipocket-ebook":
            file.file.seek(0)
            tempdir, filepath = mobi.extract(file.file)
            try:
                tika_result = parser.from_file(filepath)
            finally:
                # Remove the unpack directory even if Tika fails.
                shutil.rmtree(tempdir)
        else:
            file.file.seek(0)
            tika_result = parser.from_buffer(file.file)

        if not title:
            title = get_title(tika_result, guess=True)

        mid, norm_title, norm_extra = iscc.meta_id(title, extra)
        gmt = mime_to_gmt(media_type)
        if gmt == GMT.IMAGE:
            file.file.seek(0)
            cid = iscc.content_id_image(file.file)
        elif gmt == GMT.TEXT:
            text = tika_result["content"]
            if not text:
                raise HTTPException(HTTP_422_UNPROCESSABLE_ENTITY,
                                    "Could not extract text")
            cid = iscc.content_id_text(text)
        elif gmt == GMT.AUDIO:
            file.file.seek(0)
            features = audio_id.get_chroma_vector(file.file)
            cid = audio_id.content_id_audio(features)
        elif gmt == GMT.VIDEO:
            # Frame extraction works on a file path, so the upload is spooled
            # to a uniquely named file that is removed afterwards.
            file.file.seek(0)
            _, ext = splitext(file.filename)
            fn = "{}{}".format(uuid.uuid4(), ext)
            tmp_path = join(APP_DIR, fn)
            with open(tmp_path, "wb") as buffer:
                shutil.copyfileobj(file.file, buffer)
            try:
                features = video_id.get_frame_vectors(tmp_path)
                cid = video_id.content_id_video(features)
            finally:
                os.remove(tmp_path)
        else:
            # Without this branch `cid` would be unbound further down and the
            # request would fail with an unhandled error.
            raise HTTPException(HTTP_422_UNPROCESSABLE_ENTITY,
                                "Could not generate ISCC")

        file.file.seek(0)
        did = iscc.data_id(file.file)
        file.file.seek(0)
        iid, tophash = iscc.instance_id(file.file)

        # The meta ID is left out when the normalized title is empty.
        if not norm_title:
            iscc_code = "-".join((cid, did, iid))
        else:
            iscc_code = "-".join((mid, cid, did, iid))

        components = iscc_split(iscc_code)

        result = dict(
            iscc=iscc_code,
            tophash=tophash,
            gmt=gmt,
            bits=[code_to_bits(c) for c in components],
        )
        if norm_title:
            result["title"] = title
            result["title_trimmed"] = norm_title
        if norm_extra:
            result["extra"] = extra
            result["extra_trimmed"] = norm_extra

        return result
    finally:
        # Close the upload on every exit path, not only on success.
        file.file.close()