def endElement(self, name):
    """Save the image carried by a just-closed ``<binary>`` element.

    Presumably an ``xml.sax`` content-handler callback (confirm against the
    enclosing class).  Elements other than ``binary`` are ignored.  The
    base64 payload and the element attributes are read from the shared
    ``PARSE_DATA`` mapping (``last_tag_data`` / ``last_tag_attrs``), which is
    assumed to be filled in by the other handler callbacks -- TODO confirm.

    Side effects: writes the decoded image into ``dir_im``, adds its size to
    ``PARSE_DATA['total_image_size']`` and, on full success, increments
    ``PARSE_DATA['number']``.

    :param name: name of the element that has just been closed.
    """
    if name != 'binary':
        return

    data = PARSE_DATA['last_tag_data']

    try:
        attrs = PARSE_DATA['last_tag_attrs']
        im_id = attrs['id']
        content_type = attrs['content-type']

        # NOTE(review): im_id comes from the parsed file; verify that
        # get_file_name_from_binary cannot produce a path outside dir_im.
        im_file_name = get_file_name_from_binary(im_id, content_type)
        im_file_name = os.path.join(dir_im, im_file_name)

        im_data = base64.b64decode(data.encode())

        count_bytes = len(im_data)
        PARSE_DATA['total_image_size'] += count_bytes

        with open(im_file_name, mode='wb') as f:
            f.write(im_data)

        # Opened only to report format/size; the file is already on disk,
        # so a payload that is not a valid image is still saved.
        im = Image.open(io.BytesIO(im_data))
        if debug:
            print(' {}. {} {} format={} size={}'.format(
                PARSE_DATA['number'], im_id, sizeof_fmt(count_bytes),
                im.format, im.size))

        # Counted only when the whole image was processed without error.
        PARSE_DATA['number'] += 1

    except Exception:
        # Best effort: report the problem and let parsing continue.
        # ``Exception`` (not a bare ``except``) so Ctrl+C and SystemExit
        # still propagate.
        import traceback
        traceback.print_exc()
def do(file_name, output_dir='output', debug=True):
    """Extract embedded images from an FB2 file using regular expressions.

    Every ``<binary ...>...</binary>`` element that has both an ``id`` and a
    ``content-type`` attribute (in either order) is base64-decoded and written
    to ``<output_dir>/<basename of file_name>/``.  A failure on one image is
    reported with a traceback and does not stop the others.

    :param file_name: path to the FB2 file; it is read as UTF-8 text.
    :param output_dir: directory under which the per-book image directory
        is created.
    :param debug: when true, print per-image information and a size summary.
    """
    dir_fb2 = os.path.basename(file_name)
    dir_im = os.path.join(output_dir, dir_fb2)
    os.makedirs(dir_im, exist_ok=True)
    if debug:
        print(dir_im + ':')

    total_image_size = 0

    # Group 1 is the whole attribute string (both attributes, either order);
    # the last group is the base64 payload.
    pattern = re.compile(
        '<binary ((content-type=".+?") (id=".+?")'
        '|(id=".+?") (content-type=".+?")) *?>(.+?)</binary>', re.DOTALL)
    find_content_type = re.compile('content-type="(.+?)"')
    find_id = re.compile('id="(.+?)"')

    with open(file_name, encoding='utf8') as fb2:
        text = fb2.read()

    for i, binary in enumerate(pattern.findall(text), 1):
        try:
            attrs, im_base64 = binary[0], binary[-1]
            im_id = find_id.search(attrs).group(1)
            content_type = find_content_type.search(attrs).group(1)

            # NOTE(review): im_id comes from the parsed file; verify that
            # get_file_name_from_binary cannot produce a path outside dir_im.
            im_file_name = get_file_name_from_binary(im_id, content_type)
            im_file_name = os.path.join(dir_im, im_file_name)

            im_data = base64.b64decode(im_base64.encode())

            count_bytes = len(im_data)
            total_image_size += count_bytes

            with open(im_file_name, mode='wb') as f:
                f.write(im_data)

            # Opened only to report format/size; the file is already saved.
            im = Image.open(io.BytesIO(im_data))
            if debug:
                print(' {}. {} {} format={} size={}'.format(
                    i, im_id, sizeof_fmt(count_bytes), im.format, im.size))

        except Exception:
            # Best effort per image; Ctrl+C and SystemExit still propagate.
            import traceback
            traceback.print_exc()

    if debug:
        file_size = os.path.getsize(file_name)
        # An empty input file would otherwise raise ZeroDivisionError here.
        percent = total_image_size / file_size * 100 if file_size else 0.0
        print()
        print('fb2 file size =', sizeof_fmt(file_size))
        print('total image size = {} ({:.2f}%)'.format(
            sizeof_fmt(total_image_size), percent))
def do(file_name, output_dir='output', debug=True):
    """Extract embedded images from an FB2 file using ``ElementTree``.

    Each direct child of the document root whose local tag name (namespace
    stripped) is ``binary`` is base64-decoded and written to
    ``<output_dir>/<basename of file_name>/``.  A failure on one image is
    reported with a traceback and does not stop the others.

    :param file_name: path to the FB2 (XML) file.
    :param output_dir: directory under which the per-book image directory
        is created.
    :param debug: when true, print per-image information and a size summary.
    :raises xml.etree.ElementTree.ParseError: if the file is not well-formed
        XML (parsing happens outside the per-image error handling).
    """
    dir_fb2 = os.path.basename(file_name)
    dir_im = os.path.join(output_dir, dir_fb2)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(dir_im, exist_ok=True)
    if debug:
        print(dir_im + ':')

    total_image_size = 0
    number = 1

    root = ET.parse(file_name).getroot()

    for child in root:
        # Tags look like '{namespace}binary'; keep only the local name.
        tag = child.tag.split('}', 1)[-1]
        if tag != 'binary':
            continue

        try:
            im_id = child.attrib['id']
            content_type = child.attrib['content-type']

            # NOTE(review): im_id comes from the parsed file; verify that
            # get_file_name_from_binary cannot produce a path outside dir_im.
            im_file_name = get_file_name_from_binary(im_id, content_type)
            im_file_name = os.path.join(dir_im, im_file_name)

            # child.text is None for an empty element; the resulting
            # AttributeError is reported by the handler below.
            im_data = base64.b64decode(child.text.encode())

            count_bytes = len(im_data)
            total_image_size += count_bytes

            with open(im_file_name, mode='wb') as f:
                f.write(im_data)

            # Opened only to report format/size; the file is already saved.
            im = Image.open(io.BytesIO(im_data))
            if debug:
                print(' {}. {} {} format={} size={}'.format(
                    number, im_id, sizeof_fmt(count_bytes),
                    im.format, im.size))

            # Counted only when the whole image was processed without error.
            number += 1

        except Exception:
            # Best effort per image; Ctrl+C and SystemExit still propagate.
            import traceback
            traceback.print_exc()

    if debug:
        file_size = os.path.getsize(file_name)
        print()
        print('fb2 file size =', sizeof_fmt(file_size))
        print('total image size = {} ({:.2f}%)'.format(
            sizeof_fmt(total_image_size), total_image_size / file_size * 100))
def do(file_name, output_dir='output', debug=True):
    """Extract embedded images from an FB2 file using BeautifulSoup.

    Every ``binary`` element found by ``select("binary")`` is base64-decoded
    and written to ``<output_dir>/<basename of file_name>/``.  A failure on
    one image is reported with a traceback and does not stop the others.

    :param file_name: path to the FB2 file; it is opened in binary mode and
        handed to BeautifulSoup's ``html.parser`` backend.
    :param output_dir: directory under which the per-book image directory
        is created.
    :param debug: when true, print per-image information and a size summary.
    """
    dir_fb2 = os.path.basename(file_name)
    dir_im = os.path.join(output_dir, dir_fb2)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(dir_im, exist_ok=True)
    if debug:
        print(dir_im + ':')

    total_image_size = 0

    with open(file_name, 'rb') as fb2:
        root = BeautifulSoup(fb2, 'html.parser')

    binaries = root.select("binary")
    for i, binary in enumerate(binaries, 1):
        try:
            im_id = binary.attrs['id']
            content_type = binary.attrs['content-type']

            # NOTE(review): im_id comes from the parsed file; verify that
            # get_file_name_from_binary cannot produce a path outside dir_im.
            im_file_name = get_file_name_from_binary(im_id, content_type)
            im_file_name = os.path.join(dir_im, im_file_name)

            im_data = base64.b64decode(binary.text.encode())

            count_bytes = len(im_data)
            total_image_size += count_bytes

            with open(im_file_name, mode='wb') as f:
                f.write(im_data)

            # Opened only to report format/size; the file is already saved.
            im = Image.open(io.BytesIO(im_data))
            if debug:
                print(' {}. {} {} format={} size={}'.format(
                    i, im_id, sizeof_fmt(count_bytes), im.format, im.size
                ))

        except Exception:
            # Best effort per image; Ctrl+C and SystemExit still propagate.
            import traceback
            traceback.print_exc()

    if debug:
        file_size = os.path.getsize(file_name)
        # An empty input file would otherwise raise ZeroDivisionError here.
        percent = total_image_size / file_size * 100 if file_size else 0.0
        print()
        print('fb2 file size =', sizeof_fmt(file_size))
        print('total image size = {} ({:.2f}%)'.format(
            sizeof_fmt(total_image_size), percent
        ))
def do(file_name, output_dir='output', debug=True):
    """Extract embedded images from an FB2 file using ``etree`` and XPath.

    Every element whose local name is ``binary`` (any namespace, any depth)
    is base64-decoded and written to
    ``<output_dir>/<basename of file_name>/``.  A failure on one image is
    reported with a traceback and does not stop the others.

    :param file_name: path to the FB2 (XML) file; it is read as bytes and
        parsed with ``etree.XML``.  Parse errors propagate to the caller
        because parsing happens outside the per-image error handling.
    :param output_dir: directory under which the per-book image directory
        is created.
    :param debug: when true, print per-image information and a size summary.
    """
    dir_fb2 = os.path.basename(file_name)
    dir_im = os.path.join(output_dir, dir_fb2)
    os.makedirs(dir_im, exist_ok=True)
    if debug:
        print(dir_im + ':')

    total_image_size = 0

    with open(file_name, 'rb') as fb2:
        tree = etree.XML(fb2.read())

    # local-name() makes the query independent of the document namespace.
    binaries = tree.xpath("//*[local-name()='binary']")
    for i, binary in enumerate(binaries, 1):
        try:
            im_id = binary.attrib['id']
            content_type = binary.attrib['content-type']

            # NOTE(review): im_id comes from the parsed file; verify that
            # get_file_name_from_binary cannot produce a path outside dir_im.
            im_file_name = get_file_name_from_binary(im_id, content_type)
            im_file_name = os.path.join(dir_im, im_file_name)

            # binary.text is None for an empty element; the resulting
            # AttributeError is reported by the handler below.
            im_data = base64.b64decode(binary.text.encode())

            count_bytes = len(im_data)
            total_image_size += count_bytes

            with open(im_file_name, mode='wb') as f:
                f.write(im_data)

            # Opened only to report format/size; the file is already saved.
            im = Image.open(io.BytesIO(im_data))
            if debug:
                print(' {}. {} {} format={} size={}'.format(
                    i, im_id, sizeof_fmt(count_bytes), im.format, im.size
                ))

        except Exception:
            # Best effort per image; Ctrl+C and SystemExit still propagate.
            import traceback
            traceback.print_exc()

    if debug:
        file_size = os.path.getsize(file_name)
        print()
        print('fb2 file size =', sizeof_fmt(file_size))
        print('total image size = {} ({:.2f}%)'.format(
            sizeof_fmt(total_image_size), total_image_size / file_size * 100
        ))