# images2pdf.py
# Forked from writecrow/ocr2text.
from fpdf import FPDF
import os
import ocr2text2 as ocr2text
from glob import glob
# Folder names shared by the functions in this script.
# Images to merge are globbed from this folder.
folder_img = "img"
# Merged PDFs are written to this folder; it is also the OCR source when no
# explicit PDF path is given.
folder_pdf = "pdf"
# Destination folder handed to ocr2text for the generated .txt files.
folder_output = "txt"
# img to pdf
def group_imgs_by_name(file_name):
    """Merge every image whose name starts with *file_name* into one PDF.

    Images are looked up in ``folder_img`` with the glob pattern
    ``file_name + '*.*'`` and added one per page to a landscape PDF,
    ordered by modification time (oldest first).

    Args:
        file_name: Common file-name prefix of the images; also used as the
            base name of the generated PDF.

    Returns:
        The path of the written PDF (``folder_pdf/<file_name>.pdf``), or an
        empty string when no image matched the pattern.
    """
    # Log the pattern that is really globbed: the prefix is the start of the
    # image file name, not a sub-folder of folder_img.
    pattern = os.path.join(folder_img, file_name + '*.*')
    print("reading images:" + pattern)
    list_of_images = sorted(glob(pattern), key=os.path.getmtime)
    print(list_of_images)

    # Nothing matched: signal it to the caller with an empty path.
    if not list_of_images:
        return ""

    pdf = FPDF(orientation='L')
    pdf.compress = False
    for image in list_of_images:
        pdf.add_page()
        pdf.image(image, w=250)

    path = os.path.join(folder_pdf, file_name + ".pdf")
    # Create the output folder on first use so writing the PDF does not fail
    # merely because the folder is missing.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    pdf.output(path, "F")
    print(folder_pdf + "/" + file_name + ".pdf" + ' converted')
    return path
# pdf to txt
def pdf_to_txt(path):
    """Run ocr2text on a PDF file or a folder of PDFs and print the result.

    Args:
        path: PDF file or folder to convert. Relative paths are resolved
            against the directory containing this script. An empty string
            selects the whole ``folder_pdf`` folder.

    Returns:
        None. The resolved source path and the number of converted files
        (or an invalid-path message) are printed.
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    source = os.path.join(dir_path, path if path != '' else folder_pdf)
    print(source)

    # Guard clause: nothing to do when the source does not exist.
    if not os.path.exists(source):
        print('The path ' + source + ' seems to be invalid')
        return

    destination = os.path.join(dir_path, folder_output)
    count = 0
    if os.path.isdir(source):
        count = ocr2text.convert_recursive(source, destination, count)
    elif os.path.isfile(source):
        filename, file_extension = os.path.splitext(os.path.basename(source))
        # Only PDF files are handed to the converter; other files are
        # skipped and leave the count at zero.
        if file_extension.lower() == '.pdf':
            count = ocr2text.convert(source, os.path.join(destination, filename + '.txt'), count, 1)

    plural = '' if count == 1 else 's'
    print(str(count) + ' file' + plural + ' converted')
if __name__ == '__main__':
    # Usage: put the images in the /img folder, named xxx-n.zzz
    # (for example 1-1.jpg, 1-2.jpg). Every image whose name starts with the
    # prefix given below is merged into a single PDF, which is then converted
    # to text.
    # To process numbered groups instead, call the same two functions in a
    # loop with the prefix "{}-".format(n + 1) for n in range(30).
    merged_pdf = group_imgs_by_name('Screen Shot 2020-08-07 at 15.25.56')
    pdf_to_txt(merged_pdf)