Example #1
0
def pdf_to_jpg(pdf_path):
    '''Save the images embedded in a PDF into a per-PDF output directory.

    The output directory is created under the module-level ``pdf_to_jpg_dir``
    (which must be set before calling) and is named after the PDF's basename.
    When that directory already holds more than 10 entries the PDF is treated
    as already processed and nothing is extracted.

    Every ``LTImage`` found on a page is handed to ``save_image`` under the
    same name, ``image-<4-digit zero-based page index>.jpg``.

    Args:
        pdf_path: Path of the PDF file to read.

    Raises:
        PDFException: If the document does not allow extraction.
    '''
    with open(pdf_path, 'rb') as pdf_file:
        # Create the directory that will receive the images.
        target_dir = utils.make_outdir(pdf_to_jpg_dir, basename(pdf_path))

        # Skip when something that looks like a previous result exists.
        already_present = os.listdir(target_dir)
        if len(already_present) > 10:
            print('dir exists: skip')
            return

        with utils.timer('extract: ' + pdf_path):
            _, document = prev_pdf_parser(pdf_file)
            # Abort when the document forbids extraction.
            if not document.is_extractable:
                raise PDFException

            for page_index, page in enumerate(PDFPage.create_pages(document)):
                image_name = 'image-' + str(page_index).zfill(4) + '.jpg'
                for layout in prev_pdf_layout(page):
                    # Only image objects are saved; other items are ignored.
                    images = (item for item in layout
                              if isinstance(item, LTImage))
                    for image in images:
                        save_image(image, target_dir, image_name)
Example #2
0
def pdf_to_png(root_path):
    '''Rasterise every ``.pdf`` below ``root_path`` with the ``convert`` command.

    For each PDF an output directory named after the file (minus the last
    four characters, i.e. the ``.pdf`` extension) is created directly under
    ``root_path``. PDFs whose output directory already holds more than 3
    entries are skipped. Despite the function name, pages are written as
    ``image-%03d.jpg``.

    A non-zero exit status from ``convert`` is reported on stdout and does not
    stop the walk.

    Args:
        root_path: Directory tree to search for PDF files.
    '''
    for dirpath, _, filenames in os.walk(root_path):
        pdf_names = (name for name in filenames if name.endswith('.pdf'))
        for pdf_name in pdf_names:
            # Create the directory that will receive the images.
            output_path = utils.make_outdir(root_path, pdf_name[:-4])

            # Skip when something that looks like a previous result exists.
            if len(os.listdir(output_path)) > 3:
                continue

            pdf_path = os.path.join(dirpath, pdf_name)
            png_path = os.path.join(output_path, 'image-%03d.jpg')

            # Size hint; per the original author it may speed up conversion
            # of large images.
            size_hint = 'jpeg:size={0}x{0}'.format(config.MAX_HEIGHT)
            # Resolution is given explicitly; per the original author it
            # otherwise falls back to 72 dpi.
            density = str(config.DENSITY_LOW)
            argv = ['convert',
                    '-define', size_hint,
                    '-density', density,
                    pdf_path, png_path]

            print('exec_code: {}'.format(argv))
            with utils.timer('convert: {} -> {}'.format(pdf_path, png_path)):
                # Run the command and report a failing exit status.
                exit_code = subprocess.call(argv)
                if exit_code != 0:
                    print('failed: {}'.format(pdf_path))
Example #3
0
def devide_jpgs(jpg_dir, norm_img_size=True):
    '''Write each JPEG as a colour (``c.jpg``) or grayscale (``g.jpg``) copy.

    Images judged to be colour by ``is_color_img`` are saved unchanged with
    ``.jpg`` replaced by ``c.jpg`` in the file name; all others are converted
    to grayscale and saved with ``.jpg`` replaced by ``g.jpg``. Output goes to
    the ``NEW_JPG_DIR`` directory created under ``jpg_dir``. Files that
    ``cv2.imread`` cannot load are skipped.

    Args:
        jpg_dir: Directory containing the source ``jpg`` files.
        norm_img_size: When true, resize each image with
            ``utils.img_resize(..., max_height=config.MAX_HEIGHT)`` before
            classification.
    '''
    out_dir = utils.make_outdir(jpg_dir, NEW_JPG_DIR)
    for src_path in utils.get_path_list(jpg_dir, 'jpg'):
        picture = cv2.imread(src_path)
        if picture is None:
            continue

        # Resize only when size normalisation was requested.
        if norm_img_size:
            picture = utils.img_resize(picture, max_height=config.MAX_HEIGHT)

        if is_color_img(picture):
            new_ext = 'c.jpg'
        else:
            new_ext = 'g.jpg'
            picture = cv2.cvtColor(picture, cv2.COLOR_BGR2GRAY)

        out_name = basename(src_path).replace('.jpg', new_ext)
        cv2.imwrite(join(out_dir, out_name), picture)
Example #4
0
                box = [(v.x, v.y) for v in text.bounding_poly.vertices]
                draw.line(box + [box[0]], width=2, fill=color)
        return img_p


# Script body: load pickled text annotations, build the serif (dialogue text)
# for every image, and export (image path, serif) pairs to a CSV file.
pickle_path = Path(utils.check_argv_path(sys.argv))
img_dir = pickle_path / '..' / pickle_path.name

path_serif_list = []
# NOTE(review): presumably backed by pickle — only load files you trust.
ta_list = utils.pickle_load(pickle_path)['values']

for val in ta_list:
    img_path = val['image_path']
    tas_orig = val['text_annotation']
    bal = Balloon(img_path, tas_orig)
    # The returned image is not used below; the call is kept in case
    # make_serif() relies on state it sets up (not verifiable from here).
    draw = bal.highlight_texts()
    serif = bal.make_serif()
    path_serif_list.append([img_path, serif])

# The CSV goes into a 'csv' directory two levels above the pickle file.
csv_dir = utils.make_outdir((pickle_path / '..' / '..').resolve(), 'csv')
csv_path = Path(csv_dir) / f'{pickle_path.name}.csv'

# newline='' is required by the csv module; without it the writer's own
# '\r\n' terminators get translated again and yield blank rows on Windows.
with open(csv_path, 'w', newline='') as csv_file:
    fieldnames = ['img_path', 'serif']
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(
        {fieldnames[0]: p, fieldnames[1]: s} for p, s in path_serif_list
    )
Example #5
0
                    type=int,
                    help=('set padding size(px) for y'))
# Last CLI option: file extension of the images to process (default 'jpg').
parser.add_argument('--ext', default='jpg', help=('target ext'))
args = parser.parse_args()
print(args)

# NOTE(review): presumably validates the parsed arguments and returns the
# input image directory — confirm against utils.check_argv_path.
image_dir = utils.check_argv_path(args)
padding_x = args.pad_x
padding_y = args.pad_y
# Starts at 4 and becomes 5 when either subtitle flag is set.
# NOTE(review): presumably the expected number of vertical cut points per
# page — confirm where it is consumed.
good_cons_num_y = 4
if args.with_subtitle or args.shave_subtitle:
    good_cons_num_y += 1

# Prepare the output directories / paths.
outdir_name = '2_paint_out'
output_path = utils.make_outdir(image_dir, outdir_name)

output_koma_path = utils.make_outdir(output_path, '0_koma')
# When the koma directory already holds 3 or more entries, delete the whole
# output tree and recreate both directories.
if len(os.listdir(output_koma_path)) >= 3:
    shutil.rmtree(output_path)
    output_path = utils.make_outdir(image_dir, outdir_name)
    output_koma_path = utils.make_outdir(output_path, '0_koma')
output_shaved_path = utils.make_outdir(output_koma_path, '0_padding_shave')

# paint_out processing: 1st pass
img_path_list = utils.get_path_list(image_dir, args.ext)
# Image count minus args.start + args.end (presumably pages excluded at the
# front and back — confirm against the option definitions).
print('pages:', len(img_path_list) - (args.start + args.end))
# Timer label (runtime string, Japanese): "paint_out processing: 1st — crop
# the images whose cut positions could be determined".
with utils.timer('paint_out処理: 1st 切り抜き位置が求められた画像を切り抜き'):
    odd_cp_list = []  # cut points of odd-index pages
    even_cp_list = []  # cut points of even-index pages
    not_cut_img_path_dict = {}
Example #6
0
# 1ページの画像から各コマを切り出す処理
import os
import sys

import cv2

from cut import AverageDiffCut
import utils

# Script entry point: search every page image for the coordinates at which
# its panels should be cut out.
if __name__ == '__main__':
    image_dir = utils.check_argv_path(sys.argv)
    outdir_name = 'koma'
    output_path = utils.make_outdir(image_dir, outdir_name)
    img_path_list = utils.get_path_list(image_dir, 'jpg')

    adc = AverageDiffCut()

    # Loop that searches for the cut-out coordinates, a.k.a. cut points (cp).
    # The printed message (runtime string, Japanese) reads
    # "Detecting cut-out coordinates".
    print('切り出し座標を検出しています')
    odd_cp_list = []  # cut points of odd-index pages
    even_cp_list = []  # cut points of even-index pages
    not_cut_img_path_dict = {}
    for idx, img_path in enumerate(img_path_list):
        # NOTE(review): cv2.imread returns None for unreadable files and the
        # result is passed to cvtColor unchecked — consider guarding.
        img = cv2.imread(img_path)
        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Search for the cut-out coordinates on the grayscale page.
        cp_dict = adc.search_cut_point(img_gray)

        # Debug output: page index, the cut points, and how many were found
        # under the 'x' and 'y' keys.
        print(idx, cp_dict, len(cp_dict['x']), len(cp_dict['y']))
        # Add to the list when the prescribed number of cut points came back.
Example #7
0
def pdf_to_page1(pdf_path):
    '''Save the contents of a PDF's first page next to the PDF.

    Every item of every layout of the first page is passed to ``save_image``
    with the file name ``TMP_FILENAME_PAGE1`` and ``max_width=MAX_WIDTH_JPG1``;
    the target directory is the directory containing ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The path ``<directory of pdf_path>/TMP_FILENAME_PAGE1``.

    Raises:
        PDFException: If the document does not allow extraction.
    '''
    with open(pdf_path, 'rb') as pdf_file:
        _, document = prev_pdf_parser(pdf_file)

        # Abort when the document forbids extraction.
        if not document.is_extractable:
            raise PDFException

        # Take only the first page.
        first_page = next(PDFPage.create_pages(document))

        page1_dir = dirname(pdf_path)
        for layout in prev_pdf_layout(first_page):
            for item in layout:
                save_image(item, page1_dir, TMP_FILENAME_PAGE1,
                           max_width=MAX_WIDTH_JPG1)

    return join(page1_dir, TMP_FILENAME_PAGE1)


# Script entry point: extract the images of every PDF in the given directory.
if __name__ == '__main__':
    pdf_dir = utils.check_argv_path(sys.argv)

    # Directories returned for the 'pdf' pattern are not inputs.
    pdf_path_list = [
        candidate
        for candidate in utils.get_path_list(pdf_dir, 'pdf')
        if not os.path.isdir(candidate)
    ]

    # Module-level name; pdf_to_jpg() reads it to place its output.
    pdf_to_jpg_dir = utils.make_outdir(pdf_dir, PDF_TO_JPG_DIR)

    print('pdf files:', len(pdf_path_list))
    for pdf_path in pdf_path_list:
        pdf_to_jpg(pdf_path)
Example #8
0
        box = [(v.get('x', 0.0), v.get('y', 0.0))
               for v in text['boundingPoly']['vertices']]
        draw.line(box + [box[0]], width=2, fill=color)
    return img


if __name__ == '__main__':
    image_dir = Path(utils.check_argv_path(sys.argv))

    key_file_path = '/Users/esuji/Dropbox/program/cvtest/yonkoma2data/src/MyFirstProject-b8427fd28b8d.json'
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_file_path

    # Instantiates a client
    client = vision.ImageAnnotatorClient()

    pickle_dir = utils.make_outdir(image_dir / '..', 'pickles')
    image_path_list = sorted(utils.get_path_list(image_dir, '.jpg'))

    ta_list = []
    for idx_koma, image_path in enumerate(image_path_list[100:110]):
        print(idx_koma, image_path)

        try:
            image = get_image(image_path)
            ta = get_response_text_annotations(image)
            ta_list.append({
                'image_path': image_path,
                'text_annotation': list(ta)
            })
            print(ta[0].description)
        except Exception as e:
Example #9
0
import sys
import shutil
from collections import defaultdict

import utils


# Script entry point: sort PDF files into one directory per author.
if __name__ == '__main__':
    pdf_dir = utils.check_argv_path(sys.argv)
    pdf_path_list = utils.get_path_list(pdf_dir, 'pdf')

    # Build a mapping of author tag -> PDF paths. Only paths containing
    # exactly two '[' are considered; the author is the text between the
    # second '[' and the following ']', with spaces removed.
    author_dict = defaultdict(list)
    for pdf_path in pdf_path_list:
        pieces = pdf_path.split('[')
        if len(pieces) == 3:
            author_name = pieces[2].split(']')[0]
            author = '[{}]'.format(author_name).replace(' ', '')
            author_dict[author].append(pdf_path)

    # Create one directory per author and move that author's PDFs into it.
    for author, writing_path_list in author_dict.items():
        author_dir = utils.make_outdir(pdf_dir, author)
        for writing_path in writing_path_list:
            shutil.move(writing_path, author_dir)
Example #10
0
def jpg_to_pdf(jpg_dir_path, pdf_path):
    '''Bundle all JPEGs of a directory into a "clean" PDF via ``convert``.

    Equivalent command line: ``convert -quality <q> <jpg_dir>/*.jpg <out>``,
    where ``<out>`` is ``pdf_path`` with ``CLEAN_PDF_SUFFIX`` inserted before
    the file extension.

    The suffix is inserted with ``os.path.splitext`` rather than
    ``str.replace('.pdf', ...)``: ``replace`` would also rewrite a ``.pdf``
    occurring in a directory name, and would return the input path unchanged
    (so the output would overwrite the source) when the path does not contain
    a lowercase ``.pdf``.

    Args:
        jpg_dir_path: Directory whose ``*.jpg`` files are the pages.
        pdf_path: Path of the original PDF; used only to derive the output
            path.
    '''
    root, ext = os.path.splitext(pdf_path)
    clean_pdf_path = '{}{}{}'.format(root, CLEAN_PDF_SUFFIX, ext)
    argv = [
        'convert',
        '-quality',
        str(config.PDF_QUALITY),  # compression quality of the PDF
        join(jpg_dir_path, '*.jpg'),  # target every jpg in the directory
        clean_pdf_path,  # PDF to write
    ]
    run_process(argv)


# Script entry point: for each selected PDF, split its extracted JPEGs into
# grayscale and colour copies.
if __name__ == '__main__':
    pdf_dir = utils.check_argv_path(sys.argv)
    # NOTE(review): the pattern here is '.pdf' with a leading dot — confirm
    # that utils.get_path_list expects this form.
    pdf_path_list = utils.get_path_list(pdf_dir, '.pdf')
    clean_pdf_dir = utils.make_outdir(pdf_dir, CLEAN_PDF_DIR)

    # Drop every path that contains '['.
    pdf_path_list = [path for path in pdf_path_list if '[' not in path]
    # NOTE(review): keeps only paths containing '31' — looks like a leftover
    # debugging filter; confirm before running on a full directory.
    pdf_path_list = [path for path in pdf_path_list if '31' in path]
    print('pdf files:', len(pdf_path_list))

    for pdf_path in pdf_path_list:
        # Directory expected to hold this PDF's extracted JPEGs:
        # <pdf directory>/PDF_TO_JPG_DIR/<pdf file name>.
        jpg_dir_path = join(os.path.dirname(pdf_path), PDF_TO_JPG_DIR,
                            basename(pdf_path))

        # Skip PDFs whose JPEG directory does not exist.
        if not os.path.isdir(jpg_dir_path):
            continue

        # Timer label (runtime string, Japanese): "separate the jpg files
        # into grayscale and colour".
        with utils.timer('jpgファイルをグレスケとカラーで分別'):
            devide_jpgs(jpg_dir_path, norm_img_size=False)