Beispiel #1
0
        def endElement(self, name):
            """Handle a closing tag and save the image of a ``binary`` element.

            Closing tags other than ``binary`` are ignored. For ``binary`` the
            accumulated text in ``PARSE_DATA['last_tag_data']`` is base64-decoded
            and written to a file inside ``dir_im``; the running totals
            ``PARSE_DATA['total_image_size']`` and ``PARSE_DATA['number']`` are
            updated. ``PARSE_DATA``, ``dir_im`` and ``debug`` are not defined in
            this method (presumably they come from the enclosing scope).

            A failure while processing one image is reported with a traceback
            and does not abort parsing.

            :param name: name of the element that has just been closed.
            """
            if name != 'binary':
                return

            data = PARSE_DATA['last_tag_data']

            try:
                attrs = PARSE_DATA['last_tag_attrs']
                im_id = attrs['id']
                content_type = attrs['content-type']

                # NOTE(review): im_id is taken from the parsed document; confirm
                # that get_file_name_from_binary yields a safe file name.
                im_file_name = get_file_name_from_binary(im_id, content_type)
                im_file_name = os.path.join(dir_im, im_file_name)

                im_data = base64.b64decode(data.encode())

                count_bytes = len(im_data)
                PARSE_DATA['total_image_size'] += count_bytes

                with open(im_file_name, mode='wb') as f:
                    f.write(im_data)

                # Opening the decoded bytes as an image is done even when debug
                # is off, so undecodable image data is still reported below.
                im = Image.open(io.BytesIO(im_data))
                if debug:
                    print('    {}. {} {} format={} size={}'.format(
                        PARSE_DATA['number'], im_id, sizeof_fmt(count_bytes),
                        im.format, im.size))

                # The counter only advances for successfully processed images.
                PARSE_DATA['number'] += 1

            except Exception:
                # Best effort: report the problem and keep parsing. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate.
                import traceback
                traceback.print_exc()
Beispiel #2
0
def do(file_name, output_dir='output', debug=True):
    """Extract the images stored in ``<binary>`` tags of an fb2 file (regex variant).

    The file is read as UTF-8 text and scanned with a regular expression.
    Each payload is base64-decoded and written into the directory
    ``<output_dir>/<basename of file_name>``, which is created if needed.
    The output file name comes from ``get_file_name_from_binary`` (defined
    elsewhere). A failure on one image is reported with a traceback and the
    remaining images are still processed.

    :param file_name: path to the fb2 file.
    :param output_dir: directory in which the per-book image directory is created.
    :param debug: when true, print per-image details and a size summary.
    """
    dir_fb2 = os.path.basename(file_name)
    dir_im = os.path.join(output_dir, dir_fb2)
    os.makedirs(dir_im, exist_ok=True)
    if debug:
        print(dir_im + ':')

    total_image_size = 0

    # The two attributes may appear in either order inside the tag.
    pattern = re.compile(
        '<binary ((content-type=".+?") (id=".+?")'
        '|(id=".+?") (content-type=".+?")) *?>(.+?)</binary>', re.DOTALL)

    find_content_type = re.compile('content-type="(.+?)"')
    find_id = re.compile('id="(.+?)"')

    # Read everything up front so the file handle is released before the
    # images are processed.
    with open(file_name, encoding='utf8') as fb2:
        content = fb2.read()

    for i, binary in enumerate(pattern.findall(content), 1):
        try:
            im_id, content_type, im_base64 = None, None, None

            # findall yields one tuple of groups per tag; groups of the
            # alternative that did not match are empty strings.
            for part in binary:
                if not part:
                    continue

                match_id = find_id.search(part)
                if im_id is None and match_id is not None:
                    im_id = match_id.group(1)

                match_content_type = find_content_type.search(part)
                if content_type is None and match_content_type is not None:
                    content_type = match_content_type.group(1)

                # A group containing neither attribute is the payload.
                if match_id is None and match_content_type is None:
                    im_base64 = part

            # NOTE(review): im_id is taken from the parsed document; confirm
            # that get_file_name_from_binary yields a safe file name.
            im_file_name = get_file_name_from_binary(im_id, content_type)
            im_file_name = os.path.join(dir_im, im_file_name)

            im_data = base64.b64decode(im_base64.encode())

            count_bytes = len(im_data)
            total_image_size += count_bytes

            with open(im_file_name, mode='wb') as f:
                f.write(im_data)

            # Opened even when debug is off, so undecodable image data is
            # still reported by the handler below.
            im = Image.open(io.BytesIO(im_data))
            if debug:
                print('    {}. {} {} format={} size={}'.format(
                    i, im_id, sizeof_fmt(count_bytes), im.format, im.size))

        except Exception:
            # Best effort: report and continue with the next image. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            import traceback
            traceback.print_exc()

    if debug:
        file_size = os.path.getsize(file_name)
        # An empty input file would otherwise raise ZeroDivisionError here.
        percent = total_image_size / file_size * 100 if file_size else 0.0
        print()
        print('fb2 file size =', sizeof_fmt(file_size))
        print('total image size = {} ({:.2f}%)'.format(
            sizeof_fmt(total_image_size), percent))
Beispiel #3
0
def do(file_name, output_dir='output', debug=True):
    """Extract the images stored in ``binary`` elements of an fb2 file (ElementTree variant).

    The file is parsed with ``ET.parse`` and only the direct children of the
    root element are inspected. Each ``binary`` payload is base64-decoded and
    written into ``<output_dir>/<basename of file_name>``, which is created
    if needed. The output file name comes from ``get_file_name_from_binary``
    (defined elsewhere). A failure on one image is reported with a traceback
    and the remaining images are still processed.

    :param file_name: path to the fb2 file.
    :param output_dir: directory in which the per-book image directory is created.
    :param debug: when true, print per-image details and a size summary.
    """
    dir_fb2 = os.path.basename(file_name)
    dir_im = os.path.join(output_dir, dir_fb2)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dir_im, exist_ok=True)
    if debug:
        print(dir_im + ':')

    total_image_size = 0
    number = 1

    root = ET.parse(file_name).getroot()

    for child in root:
        # Tags look like "{namespace}name" in a namespaced document; keep only
        # the part after the first "}" (the whole tag when there is none).
        tag = child.tag.split('}', 1)[-1]
        if tag != 'binary':
            continue

        try:
            im_id = child.attrib['id']
            content_type = child.attrib['content-type']

            # NOTE(review): im_id is taken from the parsed document; confirm
            # that get_file_name_from_binary yields a safe file name.
            im_file_name = get_file_name_from_binary(im_id, content_type)
            im_file_name = os.path.join(dir_im, im_file_name)

            im_data = base64.b64decode(child.text.encode())

            count_bytes = len(im_data)
            total_image_size += count_bytes

            with open(im_file_name, mode='wb') as f:
                f.write(im_data)

            # Opened even when debug is off, so undecodable image data is
            # still reported by the handler below.
            im = Image.open(io.BytesIO(im_data))
            if debug:
                print('    {}. {} {} format={} size={}'.format(
                    number, im_id, sizeof_fmt(count_bytes), im.format, im.size))

            # The counter only advances for successfully processed images.
            number += 1

        except Exception:
            # Best effort: report and continue with the next element. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            import traceback
            traceback.print_exc()

    if debug:
        file_size = os.path.getsize(file_name)
        print()
        print('fb2 file size =', sizeof_fmt(file_size))
        print('total image size = {} ({:.2f}%)'.format(
            sizeof_fmt(total_image_size), total_image_size / file_size * 100))
def do(file_name, output_dir='output', debug=True):
    """Extract the images stored in ``binary`` tags of an fb2 file (BeautifulSoup variant).

    The file is parsed with ``BeautifulSoup(..., 'html.parser')`` and every
    element matching the ``binary`` selector is processed. Each payload is
    base64-decoded and written into ``<output_dir>/<basename of file_name>``,
    which is created if needed. The output file name comes from
    ``get_file_name_from_binary`` (defined elsewhere). A failure on one image
    is reported with a traceback and the remaining images are still processed.

    :param file_name: path to the fb2 file.
    :param output_dir: directory in which the per-book image directory is created.
    :param debug: when true, print per-image details and a size summary.
    """
    dir_fb2 = os.path.basename(file_name)
    dir_im = os.path.join(output_dir, dir_fb2)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dir_im, exist_ok=True)
    if debug:
        print(dir_im + ':')

    total_image_size = 0

    with open(file_name, 'rb') as fb2:
        root = BeautifulSoup(fb2, 'html.parser')

    for i, binary in enumerate(root.select("binary"), 1):
        try:
            im_id = binary.attrs['id']
            content_type = binary.attrs['content-type']

            # NOTE(review): im_id is taken from the parsed document; confirm
            # that get_file_name_from_binary yields a safe file name.
            im_file_name = get_file_name_from_binary(im_id, content_type)
            im_file_name = os.path.join(dir_im, im_file_name)

            im_data = base64.b64decode(binary.text.encode())

            count_bytes = len(im_data)
            total_image_size += count_bytes

            with open(im_file_name, mode='wb') as f:
                f.write(im_data)

            # Opened even when debug is off, so undecodable image data is
            # still reported by the handler below.
            im = Image.open(io.BytesIO(im_data))
            if debug:
                print('    {}. {} {} format={} size={}'.format(
                    i, im_id, sizeof_fmt(count_bytes), im.format, im.size
                ))

        except Exception:
            # Best effort: report and continue with the next image. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            import traceback
            traceback.print_exc()

    if debug:
        file_size = os.path.getsize(file_name)
        # An empty input file would otherwise raise ZeroDivisionError here.
        percent = total_image_size / file_size * 100 if file_size else 0.0
        print()
        print('fb2 file size =', sizeof_fmt(file_size))
        print('total image size = {} ({:.2f}%)'.format(
            sizeof_fmt(total_image_size), percent
        ))
Beispiel #5
0
def do(file_name, output_dir='output', debug=True):
    """Extract the images stored in ``binary`` elements of an fb2 file (XPath variant).

    The file is read as bytes, parsed with ``etree.XML`` and every element
    whose local name is ``binary`` is selected with XPath, regardless of
    namespace. Each payload is base64-decoded and written into
    ``<output_dir>/<basename of file_name>``, which is created if needed.
    The output file name comes from ``get_file_name_from_binary`` (defined
    elsewhere). A failure on one image is reported with a traceback and the
    remaining images are still processed.

    :param file_name: path to the fb2 file.
    :param output_dir: directory in which the per-book image directory is created.
    :param debug: when true, print per-image details and a size summary.
    """
    dir_fb2 = os.path.basename(file_name)
    dir_im = os.path.join(output_dir, dir_fb2)
    os.makedirs(dir_im, exist_ok=True)
    if debug:
        print(dir_im + ':')

    total_image_size = 0

    with open(file_name, 'rb') as fb2:
        tree = etree.XML(fb2.read())

    # local-name() matches the element whatever namespace it is declared in.
    binaries = tree.xpath("//*[local-name()='binary']")
    for i, binary in enumerate(binaries, 1):
        try:
            im_id = binary.attrib['id']
            content_type = binary.attrib['content-type']

            # NOTE(review): im_id is taken from the parsed document; confirm
            # that get_file_name_from_binary yields a safe file name.
            im_file_name = get_file_name_from_binary(im_id, content_type)
            im_file_name = os.path.join(dir_im, im_file_name)

            im_data = base64.b64decode(binary.text.encode())

            count_bytes = len(im_data)
            total_image_size += count_bytes

            with open(im_file_name, mode='wb') as f:
                f.write(im_data)

            # Opened even when debug is off, so undecodable image data is
            # still reported by the handler below.
            im = Image.open(io.BytesIO(im_data))
            if debug:
                print('    {}. {} {} format={} size={}'.format(
                    i, im_id, sizeof_fmt(count_bytes), im.format, im.size
                ))

        except Exception:
            # Best effort: report and continue with the next image. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            import traceback
            traceback.print_exc()

    if debug:
        file_size = os.path.getsize(file_name)
        print()
        print('fb2 file size =', sizeof_fmt(file_size))
        print('total image size = {} ({:.2f}%)'.format(
            sizeof_fmt(total_image_size), total_image_size / file_size * 100
        ))