Esempio n. 1
0
    def extract_images(self):
        """
        Extract every image embedded in the PDF at ``self.input_file``.

        Each image is saved as a JPEG inside ``self.temp_path`` (the folder
        is created when missing).  Afterwards ``self.images`` is refreshed
        with every ``*.jpg`` file found in that folder, which also includes
        files left there by earlier runs.

        Returns:
            int: number of images written during this call.
        """
        pdf_file = Path(self.input_file)

        # Creating temporary folder
        try:
            os.makedirs(self.temp_path)
        except FileExistsError:
            # Only "already there" is expected; other OS errors (e.g.
            # permissions) propagate instead of being misreported.
            print("Temporary directory already exists")
        else:
            print("Successfully created the directory %s" % self.temp_path)

        # Extract images
        n_imgs = 0
        # Keep the PDF open only while minecart is reading from it.
        with open(pdf_file, 'rb') as pdf_stream:
            doc = minecart.Document(pdf_stream)
            for p, page in enumerate(doc.iter_pages(), start=1):
                for i, im in enumerate(page.images, start=1):
                    print(f"Extracting image {i} from page {p}")
                    # Page AND image index go into the name so that several
                    # images on one page do not overwrite each other.
                    im.as_pil().save(self.temp_path / f"image_{p}_{i}.jpg")
                    n_imgs += 1

        # Update images
        self.images = glob.glob(str(self.temp_path) + "/*.jpg")
        return n_imgs
Esempio n. 2
0
def process_file(filename):
    """
    OCR the table found on every page of a PDF into ``filename + '.csv'``.

    For each page the first embedded image is converted to grayscale and
    handed to ``extract_main_table`` and then ``extract_rows_columns``.
    Every cell image of every returned row is read with
    ``pytesseract.image_to_string`` and one CSV row is written per table row.
    Pages without images, or for which no rows are returned, are skipped.

    Args:
        filename: path of the PDF to process.
    """
    # Options forwarded verbatim to Tesseract for every cell.
    config = "-l eng --oem 1 --psm 7"

    # Both files are closed on exit, even if OCR fails half-way through.
    with open(filename, 'rb') as pdffile:
        doc = minecart.Document(pdffile)

        # newline='' lets the csv module control line endings itself.
        with open(filename + '.csv', 'w', newline='') as csvfile:
            csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL)

            # iterating through all pages (1-based page number for messages)
            for pagen, page in enumerate(doc.iter_pages(), start=1):
                if len(page.images) == 0:
                    print("Page %d: No Images found" % (pagen))
                    continue

                # Only the first image of the page is considered.
                im = page.images[0].as_pil().convert('L')  # force grayscale
                gray_image = np.array(im)

                extracted_table = extract_main_table(gray_image)
                if DEBUG:
                    show_wait_destroy("extracted", extracted_table)

                row_images = extract_rows_columns(extracted_table)
                if len(row_images) == 0:
                    continue

                total_rows = len(row_images)
                for idx, row in enumerate(row_images, start=1):
                    print("%s : Extracting row %d out of %d page %d" %
                          (filename, idx, total_rows, pagen))
                    row_texts = [
                        pytesseract.image_to_string(column, config=config)
                        for column in row
                    ]
                    csv_writer.writerow(row_texts)
Esempio n. 3
0
def load_pdf(pdf_path: Path) -> List[PdfPageContainer]:
    """Build one ``PdfPageContainer`` per page of the PDF at *pdf_path*.

    For every page, each lettering contributes an
    ``(int(x0), int(y0), text)`` triple taken from the first two values of
    its bounding box, and ``shapes`` receives the filled shapes whose fill
    colour, converted to RGB, equals ``(1, 0, 0.498039)`` exactly.

    Returns:
        The containers in page order.
    """
    wanted_rgb = (1, 0, 0.498039)
    containers = []

    with open(pdf_path, "rb") as stream:
        document = minecart.Document(stream)

        for pdf_page in document.iter_pages():
            container = PdfPageContainer()

            # Text: position of each lettering plus its string value.
            for lettering in pdf_page.letterings:
                box = lettering.get_bbox()
                triple = (int(box[0]), int(box[1]), str(lettering))
                container.triples.append(triple)

            # Shapes: first drop unfilled ones, then match on the colour.
            with_fill = [
                shape for shape in pdf_page.shapes if shape.fill is not None
            ]
            container.shapes = [
                shape for shape in with_fill
                if shape.fill.color.as_rgb() == wanted_rgb
            ]

            containers.append(container)

    return containers
Esempio n. 4
0
    def get_axis_dict(self):
        """
        Locate the color legend (z axis) and the main pad in the plot PDF.

        Reads the first page of ``self.data['name'] + '.pdf'`` and collects
        the shapes whose stroke and fill colors are identical and are
        neither pure black nor pure white.  Of those, the shapes sharing the
        largest ``max_x`` form the color legend; the rest form the main pad.

        Sets ``z_axis_shapes``, ``z_axis_dict`` (RGB tuple ->
        ``{'ymin', 'ymax'}``), ``z_axis_ymax``, ``z_axis_ymin``,
        ``z_max_color``, ``z_min_color``, ``main_shapes`` and
        ``main_x_max`` / ``main_y_max`` / ``main_x_min`` / ``main_y_min``.

        Raises:
            RuntimeError: if a matching shape's path does not have exactly
                6 segments.
            ValueError: if no colored shape, or no main-pad shape, is found.
        """
        import minecart

        colored_shapes = []
        # `with` closes the file even when the RuntimeError below is raised.
        with open(self.data['name'] + '.pdf', 'rb') as pdffile:
            doc = minecart.Document(pdffile)
            page = doc.get_page(0)

            # Find colored box shapes: identical stroke and fill color,
            # neither black nor white.
            for shape in page.shapes:
                if not (shape.fill and shape.stroke
                        and hasattr(shape.stroke, 'color')):
                    continue
                fill_rgb = shape.fill.color.as_rgb()
                if shape.stroke.color.as_rgb() != fill_rgb:
                    continue
                if fill_rgb in [(1, 1, 1), (0, 0, 0)]:
                    continue
                # The boxes seen so far have 6 path segments
                # (there are two 'h' objects at the end).
                if len(shape.path) != 6:
                    raise RuntimeError("You need to look at this shape: %r" %
                                       shape.path)
                colored_shapes.append(shape)

        if not colored_shapes:
            # Same exception type max() would raise, with a usable message.
            raise ValueError("No colored shapes found in %s.pdf" %
                             self.data['name'])

        # global max_x for all appropriately colored shapes
        max_x_global = max(map(max_x, colored_shapes))

        # Shapes sharing the maximal x coordinate are the color legend.
        self.z_axis_shapes = [
            s for s in colored_shapes if max_x(s) == max_x_global
        ]
        self.z_axis_dict = {}
        for shape in self.z_axis_shapes:
            self.z_axis_dict[tuple(shape.fill.color.as_rgb())] = {
                'ymin': shape.path[0][2],
                'ymax': shape.path[2][2]
            }
        self.z_axis_ymax = max(d['ymax'] for d in self.z_axis_dict.values())
        self.z_axis_ymin = min(d['ymin'] for d in self.z_axis_dict.values())
        self.z_max_color = next(k for k, v in self.z_axis_dict.items()
                                if v['ymax'] == self.z_axis_ymax)
        self.z_min_color = next(k for k, v in self.z_axis_dict.items()
                                if v['ymin'] == self.z_axis_ymin)

        # These are the shapes (hopefully rectangular) of the main pad:
        # every colored shape whose max_x is below the global maximum.
        self.main_shapes = [
            s for s in colored_shapes if max_x(s) < max_x_global
        ]
        if not self.main_shapes:
            raise ValueError("No main-pad shapes found in %s.pdf" %
                             self.data['name'])

        # max/min of the coordinates of the shapes in the PDF
        self.main_x_max = max(map(max_x, self.main_shapes))
        self.main_y_max = max(map(max_y, self.main_shapes))
        self.main_x_min = min(map(min_x, self.main_shapes))
        self.main_y_min = min(map(min_y, self.main_shapes))
Esempio n. 5
0
def get_region_boundary(pdf):
    """
    Return the region spanned by all shapes and letterings on the first page.

    Each shape contributes the coordinates of its first two path segments
    (elements 1 and 2 of each segment); each lettering contributes its
    bounding box.  The result takes the smallest ``x1``, the largest ``y1``,
    the largest ``x2`` and the smallest ``y2`` over all of them.

    Args:
        pdf: path of the PDF file.

    Returns:
        dict with keys ``"x1"``, ``"y1"``, ``"x2"``, ``"y2"``.

    Raises:
        ValueError: if the page has neither shapes nor letterings.
    """
    with open(pdf, 'rb') as fp:
        doc = minecart.Document(fp)
        page = doc.get_page(0)

        combined = []
        for shape in page.shapes:
            first, second = shape.path[0], shape.path[1]
            combined.append({"x1": first[1], "y1": first[2],
                             "x2": second[1], "y2": second[2]})
        for letter in page.letterings:
            # One get_bbox() call per lettering instead of four.
            bbox = letter.get_bbox()
            combined.append({"x1": bbox[0], "y1": bbox[1],
                             "x2": bbox[2], "y2": bbox[3]})

    if not combined:
        # Same exception type min()/max() would raise, with a clear message.
        raise ValueError("no shapes or letterings on the first page of %r" %
                         (pdf,))

    return {
        "x1": min(item['x1'] for item in combined),
        "y1": max(item['y1'] for item in combined),
        "x2": max(item['x2'] for item in combined),
        "y2": min(item['y2'] for item in combined),
    }
Esempio n. 6
0
def image_finder(path):
    """
    Count the images embedded in the PDF at *path*.

    The pages are walked once with ``minecart``'s own page iterator, so the
    file no longer has to be opened a second time with PyPDF2 just to learn
    the page count, and the handle is closed when counting is done.

    Args:
        path: path of the PDF file.

    Returns:
        int: total number of images over all pages.
    """
    with open(path, 'rb') as pdffile:
        doc = minecart.Document(pdffile)
        return sum(len(page.images) for page in doc.iter_pages())
Esempio n. 7
0
 def test_ai_file_as_pdf(self):
     "Test real-world parsing of ICCBased colors."
     # Test file from snoyer/minecart
     pdfpath = os.path.join(os.path.dirname(__file__), 'testdocs',
                            'ai-files-are-pdfs.pdf')
     red = (0.929, 0.11, 0.141)
     black = (0.137, 0.122, 0.125)
     blue = (0.18, 0.192, 0.573)
     # Read the colors while the file is open, then close it so the test
     # does not leak a file handle.
     with open(pdfpath, 'rb') as pdffile:
         doc = minecart.Document(pdffile)
         page = doc.get_page(0)
         fill_colors = set(
             tuple(shape.fill.color.as_rgb()) for shape in page.shapes)
     self.assertEqual(fill_colors, {red, black, blue})
Esempio n. 8
0
 def saveFileDialog(self):
     """
     Ask for a PDF and split it into runs of color / black-and-white pages.

     Two modules are used to inspect and create the PDFs:
     minecart: detects the fill colors on each PDF page (more info on PyPI).
     Warning (from the original author): minecart did not pass its unit
     tests -- high risk.
     PyPDF2: PDF generation tools -- more mature and developed.

     A page counts as "color" when at least one filled shape has a fill
     other than pure black or pure white.  Consecutive pages of the same
     kind are queued.  When the kind changes, or the last page is reached,
     the queued pages are written to ``ColorDocument<i>.pdf`` or
     ``blackwhitedocument<i>.pdf``, where ``<i>`` is the index of the page
     that ended the run.  Pages still queued after the loop are written to
     ``Document_last.pdf``.

     Returns ``None``; does nothing when the dialog is cancelled.
     """
     # getOpenFileName returns a (path, selected_filter) tuple
     fileName, _filter = QFileDialog.getOpenFileName(self, "Open File")
     if not fileName:
         # Dialog cancelled: there is no file to split.
         return

     # Each reader gets its own stream; both stay open for the whole run
     # because pages are read while writing, and both are closed on exit.
     with open(fileName, "rb") as pypdf_stream, \
             open(fileName, "rb") as minecart_stream:
         pdffile = PdfFileReader(pypdf_stream)
         document = minecart.Document(minecart_stream)
         last_index = pdffile.getNumPages() - 1

         # Queue of page indices belonging to the run being collected.
         queue = deque()
         previouscolors = 0
         for i in range(last_index + 1):
             page = document.get_page(i)
             currentcolors = 0
             for shape in page.shapes:
                 if shape.fill:
                     print(shape.fill.color.as_rgb())
                     if shape.fill.color.as_rgb() not in [(0, 0, 0), (1, 1, 1),
                                                          [0, 0, 0], [1, 1, 1]]:
                         currentcolors = 1

             if i == 0:
                 previouscolors = currentcolors
                 queue.append(i)
                 continue

             print(currentcolors, previouscolors)
             if previouscolors == currentcolors and i != last_index:
                 queue.append(i)
                 continue

             # Kind changed (or last page reached): flush the queued run.
             output = PdfFileWriter()
             while queue:
                 output.addPage(pdffile.getPage(queue.popleft()))
             if previouscolors == 1:
                 target = "ColorDocument%s.pdf" % i
             else:
                 target = "blackwhitedocument%s.pdf" % i
             with open(target, "wb") as outputStream:
                 output.write(outputStream)
             previouscolors = currentcolors
             queue.append(i)

         if queue:
             # A fresh writer is required here: reusing the previous one
             # would repeat pages that were already written, and for a
             # single-page PDF no writer exists yet at all.
             output = PdfFileWriter()
             while queue:
                 output.addPage(pdffile.getPage(queue.popleft()))
             with open("Document_last.pdf", "wb") as outputStream:
                 output.write(outputStream)
     """
Esempio n. 9
0
def uploaded_file():
    """
    Save the first image of every PDF page as a JPEG and render them.

    The document at ``'pathofdocumet'`` is read page by page; the first
    image of page ``j`` is stored as ``<docsfolder>/<j>.jpg`` where
    ``docsfolder`` comes from ``app.config``.  Pages without images are
    skipped.  The paths of the files actually written are passed to the
    ``x.html`` template as ``user_image``.

    Returns:
        The value returned by ``render_template``.
    """
    docs_folder = app.config['docsfolder']
    pageref = []

    # Close the PDF once all pages have been processed.
    with open('pathofdocumet', 'rb') as file:
        doc = minecart.Document(file)
        for j, page in enumerate(doc.iter_pages()):
            if not page.images:
                # Nothing to save; indexing images[0] would raise IndexError.
                continue
            target = os.path.join(docs_folder, f'{j}.jpg')
            page.images[0].as_pil().save(target)
            # Reference only what was written, not a fixed count of six.
            pageref.append(target)
    print(pageref)

    return render_template("x.html", user_image=pageref)
def extract_table_image_count_pdf(resume):
    """
    Count the tables and the images found in an uploaded PDF.

    The file is looked up as ``media/<resume.name>``.  Tables are detected
    with ``tabula.read_pdf`` over all pages; images are counted page by page
    with ``minecart``.

    Args:
        resume: object whose ``name`` attribute is the file name inside the
            ``media`` folder.

    Returns:
        list: ``[number_of_tables, number_of_images]``.
    """
    # Local import so this function does not depend on the file's imports.
    import os

    # os.path.join yields 'media\\<name>' on Windows (as before) and a
    # valid 'media/<name>' path everywhere else.
    pdf_path = os.path.join('media', resume.name)

    df = tabula.read_pdf(pdf_path, pages="all", multiple_tables=True)

    # iterating through all pages; the file is closed when counting is done
    with open(pdf_path, 'rb') as pdffile:
        doc = minecart.Document(pdffile)
        images_count = sum(len(page.images) for page in doc.iter_pages())

    return [len(df), images_count]
 def extract_img_minecart(self, full_file_name: str):
     """
     Extract the images of a PDF with minecart and save them as JPEGs.

     Every image is written to ``self.write_path`` as
     ``<pdf file name>_<page index>_<image index>.jpg`` (both indices are
     0-based).  This is best effort: an image that cannot be converted or
     saved is reported with ``print`` and skipped, and any other failure
     (e.g. an unreadable PDF) is printed as well instead of being raised.

     Args:
         full_file_name: path of the PDF to read.
     """
     base_name = os.path.basename(full_file_name)
     try:
         # `with` closes the PDF even when parsing fails half-way.
         with open(full_file_name, 'rb') as pdf_doc:
             doc = minecart.Document(pdf_doc)
             for page_no, page in enumerate(doc.iter_pages()):
                 for img_no, image in enumerate(page.images):
                     try:
                         im = image.as_pil()  # convert into a PIL image
                         # The page index is part of the name so images of
                         # later pages do not overwrite earlier ones.
                         name = os.path.join(
                             self.write_path,
                             f'{base_name}_{page_no}_{img_no}.jpg')
                         im.save(name)
                     except Exception as e:
                         print(e)
     except Exception as e:
         print(e)
Esempio n. 12
0
def extractPDFImages(file_name, output_folder, img_dir):
    """
    Dump the raw data of every image in a PDF into a fresh folder.

    The target folder is ``img_dir/<getFilteredPath(file_name, False)>``; it
    is deleted first when it already exists.  Images are written in page
    order as ``image1.png``, ``image2.png``, ... using the bytes returned by
    ``image.obj.get_data()``.  A page that fails is skipped.

    Args:
        file_name: path of the PDF to read.
        output_folder: ignored; the value is replaced by the folder derived
            from ``img_dir`` (kept for interface compatibility).
        img_dir: directory under which the per-file folder is created.

    Returns:
        list of the written file paths, or ``None`` when a failure outside
        the per-page handling occurred (the error is printed).
    """
    try:
        # Both handles are closed on exit, including on errors.
        with open(file_name, 'rb') as pdffile, \
                open(file_name, "rb") as pypdf_stream:
            # PyPDF2 is only used to learn the number of pages.
            fileHandler = PyPDF2.PdfFileReader(pypdf_stream)
            doc = minecart.Document(pdffile)
            imglist = []
            output_folder = os.path.join(img_dir,
                                         getFilteredPath(file_name, False))

            if os.path.exists(output_folder):
                shutil.rmtree(output_folder)
            os.mkdir(output_folder)

            j = 1  # running image number across all pages
            for i in range(fileHandler.numPages):
                try:
                    page = doc.get_page(i)
                    for image in page.images:
                        byteArray = image.obj.get_data()
                        target = os.path.join(output_folder,
                                              'image' + str(j) + '.png')
                        with open(target, 'wb') as f:
                            f.write(byteArray)
                        imglist.append(target)
                        j = j + 1
                except Exception:
                    # Best effort per page; unlike a bare `except:` this
                    # lets KeyboardInterrupt / SystemExit through.
                    continue
        return imglist
    except Exception as e:
        print(str(e))
        return None
Esempio n. 13
0
def _find_images(infile):
    """
    Collect information about every image in a PDF.

    Pages are fetched one by one with ``document.get_page`` until it returns
    ``None``.  Each image that converts to PIL without error yields a dict
    with the keys ``image_data`` (the PIL image), ``bbox``, ``page``
    (1-based page number), ``message`` (``''``), ``valid`` (``False``) and
    ``data`` (``''``).  Images raising ``ValueError`` or
    ``PDFNotImplementedError`` during conversion are reported and skipped.

    Args:
        infile: object passed straight to ``minecart.Document``.

    Returns:
        list of the dicts described above, or the string
        ``'document was None'`` if no document object was obtained.
    """
    document = minecart.Document(infile)
    if document is None:
        print('the document is None!')
        sys.stdout.flush()
        return 'document was None'

    images = []
    page_num = 0
    while True:
        page = document.get_page(page_num)
        if page is None:
            # Past the last page.
            break
        page_num += 1
        for i in page.images:
            try:
                pil_image = i.as_pil()
            except ValueError:
                print('Got a ValueError, skipping')
            except PDFNotImplementedError:
                print('Got a PDFNotImplementedError, skipping')
            else:
                images.append({
                    'image_data': pil_image,
                    'bbox': i.get_bbox(),
                    'page': page_num,
                    'message': '',
                    'valid': False,
                    'data': '',
                })

    pprint.pprint(images)
    # Single-argument print() calls give the same output on Python 2 and 3.
    print('%d images found' % len(images))
    sys.stdout.flush()

    return images
Esempio n. 14
0
import cv2
import numpy as np
from pyzbar.pyzbar import decode
import time
import minecart
import glob
import xlwt

# Spreadsheet objects for the rest of the script.
workbook = xlwt.Workbook()
sheet = workbook.add_sheet("Sheet 1")
style = xlwt.easyxf('font: bold 1')

# Save the first image of every page as 1.jpg, 2.jpg, ... in the current
# working directory.  `count` is the number of the next file to write.
count = 1
with open('wagecard.pdf', 'rb') as pdffile:
    doc = minecart.Document(pdffile)
    # iterating through all pages
    for page in doc.iter_pages():
        if not page.images:
            # No image on this page; images[0] would raise IndexError.
            continue
        im = page.images[0].as_pil()  # requires pillow
        name = str(count) + '.jpg'
        count = count + 1
        im.save(name)

# NOTE(review): the saves above are synchronous, so this pause is
# presumably unnecessary -- kept to preserve the original timing.
time.sleep(10)
# file location: every JPEG in the current working directory
path = glob.glob("*.jpg")
cv_img = []
for multiple_files in path:

def extract_output(page):
    """
    Gather the text of *page*, one string per box listed in ``BOXES``.

    Returns a list with one entry per box, in ``BOXES`` order (nine cells
    according to the original description):

        [A, B, C, D, E, F, G, H, I]

    Each entry is a string made of all the letterings found inside the
    box, joined with single spaces, with non-breaking spaces replaced by
    plain spaces and surrounding whitespace stripped.

    """
    def reading_order(lettering):
        # Top-to-bottom first (descending bbox[3]), then left-to-right
        # (ascending bbox[0]).
        return (-lettering.bbox[3], lettering.bbox[0])

    cells = []
    for cell_box in BOXES:
        ordered = sorted(page.letterings.iter_in_bbox(cell_box),
                         key=reading_order)
        joined = " ".join(ordered)
        cells.append(joined.replace(u"\xa0", " ").strip())
    return cells


# One extract_output() result per page of the scanned book.
content = []
# A PDF is binary data: it must be opened in 'rb' mode.  Text mode with
# encoding="utf-8" fails to decode the file and hands str instead of bytes
# to the parser.  `with` also closes the file once every page is read.
with open(
        "C:/Users/Sean/Desktop/projects/Pdfs_from_scan/The boxing register  International Boxing Hall of Fame official record book by Roberts, James B. Skutt, Alexander G (z-lib.org).pdf",
        "rb") as pdf_file:
    doc = minecart.Document(pdf_file)
    for page in doc.iter_pages():
        content.append(extract_output(page))
Esempio n. 16
0
import img2pdf

# Prepare the working folders: make sure 'in' exists and always start from
# an empty 'out' folder (any previous 'out' is deleted).
if not os.path.exists('in'):
	os.mkdir('in')
if os.path.exists('out'):
	shutil.rmtree('out')
os.mkdir('out')


# Process every regular file in 'in' whose second dot-separated name
# component is 'pdf'; each one gets its own sub-folder 'out/<first part>'.
for file in os.listdir('in'):
	if os.path.isfile('in/'+file):
		# NOTE(review): ext[1] raises IndexError for names without a dot and
		# is not the extension for names with several dots.
		ext = file.split('.')
		if ext[1] == 'pdf':
			os.mkdir('out/'+ext[0])
			# NOTE(review): pdf_file is not closed in the visible part of
			# this snippet.
			pdf_file = open('in/'+file, 'rb')
			pdf_doc = minecart.Document(pdf_file)
			# `page` and `im` set here are overwritten by the loops below.
			# NOTE(review): page.images[0] raises IndexError when the first
			# page has no image.
			page = pdf_doc.get_page(0)
			# i and j are used in the output file name; they are not
			# incremented in the visible part of this snippet.
			i=0
			j=0
			im = page.images[0]
			
			for page in pdf_doc.iter_pages():
				for im in page.images:
					# Target size: one third of the original in each
					# dimension (integer division).
					# NOTE(review): as_pil() is called again for every
					# attribute below -- presumably converting the image
					# each time; TODO confirm and cache if so.
					width = im.as_pil().width // 3
					height = im.as_pil().height // 3
					print(im.as_pil().format, im.as_pil().size, im.as_pil().mode)
					print(width, height)
					#print(page.width, page.height)
					#print(im.as_pil().width, im.as_pil().height)
					new_filename="out/"+ext[0]+"/"+ext[0]+"_"+str(i)+str(j)+".jpg"
					# resample=3 is presumably Pillow's BICUBIC filter -- TODO confirm.
					im2=im.as_pil().resize((width, height), resample=3, box=None, reducing_gap=None)
import minecart
import PyPDF2
import cv2

# Goal: write a method "getPDFPageColor" that iterates through the big PDF
# and outputs the color of each page, associating each page index with its
# color (looking for a way to flag PDF pages by color).
filePath = r"C:/Users/ericm/Desktop/bigScans/bigScan1.pdf"
pdfFileObj = open(filePath, 'rb')

# cv2.imread gives None for a file it cannot decode; only show the image
# when something was actually loaded, so the script does not stop here.
imageObj = cv2.imread(filePath)
if imageObj is not None:
    cv2.imshow('image', imageObj)

# Hand PyPDF2 the stream opened above.  The former second positional
# argument "rb" was not a file mode: it landed in PdfFileReader's next
# parameter.
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pdfLength = pdfReader.getNumPages()
pageObj = pdfReader.getPage(0)
print(pageObj)

# minecart reads from its own stream so the two parsers do not move each
# other's file position.  Left open because `doc` is a module-level object.
minecartFileObj = open(filePath, 'rb')
doc = minecart.Document(minecartFileObj)

# Pages are 0-based, so valid indices are 0 .. pdfLength - 1; the former
# `i <= pdfLength` bound asked for one page too many.
pageList = []
for i in range(pdfLength):
    pageList.append(doc.get_page(i))
        'Year': Year,
        'Location': Location,
        'Website': Website,
        'Category': Category,
        'Ownership': Ownership,
        'Keypeople': Keypeople
    }


if __name__ == '__main__':
    name_box = (0, 688, 288, 835)
    description_investor_box = (30, 30, 300, 376)
    glance_box = (288, 41, 576, 376)

    file = open("Fintech100-12-111.pdf", 'rb')
    doc = minecart.Document(file)

    Company100Information = []
    a = 0
    while a < 100:
        page = doc.get_page(a)
        if a < 50:
            name = get50_leading_company_name("".join(
                page.letterings.iter_in_bbox(name_box)))
        else:
            name = get50_emerging_company_name("".join(
                page.letterings.iter_in_bbox(name_box)))
        description_investor = get_description_investors("".join(
            page.letterings.iter_in_bbox(description_investor_box)))
        other_information = get_other_information("".join(
            page.letterings.iter_in_bbox(glance_box)))