Ejemplos de image_to_pdf_or_hocr en Python, ejemplos de pytesseract.image_to_pdf_or_hocr en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: words.py Proyecto: meghashyamk4/Text-extraction-from-Image

def find_word_in_image():
    """Interactively highlight occurrences of a word in an image.

    Reads an image path and a search word from stdin, runs Tesseract
    word-level OCR on the image and, for every recognised word for which
    ``srch(word.lower(), key.lower())`` is truthy, draws the word and a
    rectangle around it.  Prints whether anything matched, then shows the
    annotated image (resized to 1000x800) until a key is pressed.

    If the image cannot be read, a message is printed and nothing else
    happens.
    """
    print('Enter the image path : ')
    image_path = input()
    print('Enter the word to search : ')
    key = input()
    pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports a missing/unreadable file by returning None;
        # passing that on to cvtColor would fail with an opaque OpenCV error.
        print('Could not read image : ' + image_path)
        return
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    print('Checking...')

    # The previous version also rendered a searchable PDF here and threw the
    # result away; that second full OCR pass has been dropped.
    boxes = pytesseract.image_to_data(img)

    needle = key.lower()
    colour = (50, 50, 255)
    found = False
    # The first line of the TSV output is the column header.
    for row in boxes.splitlines()[1:]:
        fields = row.split()
        # Only rows with all 12 columns carry recognised text (column 11).
        if len(fields) == 12 and srch(fields[11].lower(), needle):
            x, y, w, h = (int(value) for value in fields[6:10])
            cv2.putText(img, fields[11], (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX,
                        1, colour, 2)
            cv2.rectangle(img, (x, y), (x + w, y + h), colour, 2)
            found = True

    if found:
        print('Word found !')
    else:
        print('Word not found')
    size = (1000, 800)
    cv2.imshow('img', cv2.resize(img, size))
    cv2.waitKey(0)

Ejemplo n.º 2

0

Mostrar archivo

Archivo: checkLetters.py Proyecto: PublicMakings/openCVexperiments

def tesseract():
    """Run each pytesseract entry point once on ``imgFile`` and print the results.

    ``imgFile`` is read from the enclosing module.  The string, box, data and
    OSD outputs are printed; the PDF and hOCR outputs are generated but
    discarded, as in the original demo.
    """
    # If you don't have the tesseract executable in your PATH, include the following:
    # pytesseract.pytesseract.tesseract_cmd = r'<full_path_to_your_tesseract_executable>'
    # Example tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract'

    # Open the image once and close it deterministically instead of leaking
    # four separate handles.
    with Image.open(imgFile) as image:
        # Simple image to string (encoded so the text prints on any console)
        print(pytesseract.image_to_string(image).encode('utf8'))

        # Get bounding box estimates
        print(pytesseract.image_to_boxes(image))

        # Get verbose data including boxes, confidences, line and page numbers
        print(pytesseract.image_to_data(image))

        # Get information about orientation and script detection
        print(pytesseract.image_to_osd(image))

    # To bypass the internal image conversions, pass the image path directly.
    # NOTE: If you don't use supported images, tesseract will return error
    print(pytesseract.image_to_string(imgFile))

    # get a searchable PDF (result intentionally unused in this demo)
    pytesseract.image_to_pdf_or_hocr(imgFile, extension='pdf')

    # get HOCR output (result intentionally unused in this demo)
    pytesseract.image_to_pdf_or_hocr(imgFile, extension='hocr')

Ejemplo n.º 3

0

Mostrar archivo

Archivo: tesseract.py Proyecto: allanfs1/OP_-_-__Projeto_OpenCV

 def tesseract_pdf(self, ops):
     """Run Tesseract on a fixed sample image.

     :param ops: ``0`` renders 'Placa.jpg' as a searchable PDF; any other
         value renders 'test.png' as hOCR.

     The OCR result is not returned or stored.
     """
     # ``ops is 0`` compared identity with a literal, which only works by
     # accident of CPython's small-int cache and emits a SyntaxWarning on
     # Python 3.8+; compare by value instead.
     if ops == 0:
         # get a searchable PDF
         pytesseract.image_to_pdf_or_hocr('Placa.jpg',
                                          extension='pdf')
     else:
         # get HOCR output
         pytesseract.image_to_pdf_or_hocr('test.png',
                                          extension='hocr')

Ejemplo n.º 4

0

Mostrar archivo

Archivo: server.py Proyecto: RobinRojowiec/ocr-api

def extract_text(file: UploadFile = File(...),
                 lang: str = "eng",
                 text_only: bool = False,
                 custom_config: str = None):
    """
    OCR an uploaded image and return the recognised content.

    The upload is stored under ``temp/`` for the duration of the call and
    removed afterwards, even when OCR or parsing fails.

    :param file: uploaded image file
    :param lang: available: deu, eng
    :param text_only: when true, respond with the plain recognised text;
        otherwise respond with the hOCR output converted by
        ``hocr_to_simple_json``
    :param custom_config: extra Tesseract options; ``'--oem 3'`` when omitted
    :return: a ``PlainTextResponse`` or the value of ``hocr_to_simple_json``
    """
    # The client controls file.filename: keep only its last path component so
    # a name like "../../app/server.py" cannot escape the temp directory.
    safe_name = os.path.basename(file.filename.replace("\\", "/"))
    filepath = "temp/" + safe_name
    with file.file:
        with open(filepath, "wb") as temp_file:
            temp_file.write(file.file.read())

    # preprocess_image(filepath)
    if custom_config is None:
        custom_config = '--oem 3'

    try:
        if text_only:
            output = bytes(pytesseract.image_to_string(filepath,
                                                       lang=lang,
                                                       config=custom_config),
                           encoding="utf-8")
            response = PlainTextResponse(content=output)
        else:
            output = pytesseract.image_to_pdf_or_hocr(filepath,
                                                      lang=lang,
                                                      extension='hocr',
                                                      config=custom_config)
            extracted = xmltodict.parse(output)
            response = hocr_to_simple_json(extracted, lang)
    finally:
        # Always clean up the stored upload, including on OCR/parse errors.
        os.remove(filepath)
    return response

Ejemplo n.º 5

0

Mostrar archivo

Archivo: to_text.py Proyecto: ml-lab/ocrized-text-dataset

def to_text(filename, lang="eng", format_="txt", ignore_error=False):
    """ Extract text from an image using Tesseract OCR.

    The result is written next to the image, using the image's base name
    and ``format_`` as the extension.

    Arguments:
        filename {str} -- path to an image

    Keyword Arguments:
        lang {str} -- Tesseract OCR language option (default: "eng")
        format_ {str} -- Tesseract output format ("hocr" or "txt", default: "txt")
        ignore_error {bool} -- catch and ignore all exceptions (default: False)

    Raises:
        ValueError -- ``format_`` is neither "txt" nor "hocr" (only when
            ``ignore_error`` is False; otherwise the error is printed)
    """
    try:
        # Validate first: previously an unknown format fell through both
        # branches and failed later with an UnboundLocalError.
        if format_ not in ("txt", "hocr"):
            raise ValueError("Unsupported output format: {!r}".format(format_))

        basename, _ = os.path.splitext(filename)
        target = basename + "." + format_
        with Image.open(filename) as im:
            if format_ == "txt":
                tess_output = pytesseract.image_to_string(im, lang=lang, config="--psm 3 --oem 1")
            else:
                tess_output = pytesseract.image_to_pdf_or_hocr(
                    im, lang=lang, config="--psm 3 --oem 1", extension=format_
                )

        # hOCR comes back as bytes; str(bytes) would write the literal
        # "b'...'" repr into the file instead of the markup.
        if isinstance(tess_output, bytes):
            tess_output = tess_output.decode("utf-8")
        with open(target, "w", encoding="utf-8") as fp:
            fp.write(tess_output)
    except Exception as e:
        if ignore_error:
            print("Error: {}".format(e))
        else:
            raise

Ejemplo n.º 6

0

Mostrar archivo

    def _extract(self):
        """Run every selected field model on the current document.

        JPG/PNG inputs are first OCR'd into a temporary searchable PDF whose
        path is handed to the models.  Predictions are shown in the viewer,
        logged as JSON, and the start button is re-enabled.
        """
        path = self.paths[self.pathidx]

        self.logger.clear()
        self.logger.log("Extracting information from '{}'...\n".format(path))

        temp = None
        try:
            if path.split('.')[-1].lower() in ['jpg', 'png']:
                image = Image.open(path)
                pdf = pytesseract.image_to_pdf_or_hocr(image, extension='pdf')
                temp = tempfile.NamedTemporaryFile(suffix='.pdf')
                temp.write(pdf)
                # The models re-open the file by name, so the buffered bytes
                # must reach the disk before they read it.
                temp.flush()
                path = temp.name

            predictions = {}
            for key in FIELDS:
                if self.checkboxes[key].get():
                    model = AttendCopyParse(field=key, restore=True)
                    predictions[key] = model.predict(paths=[path])[0]
        finally:
            # Closing deletes the temporary PDF; do it even if a model fails.
            if temp is not None:
                temp.close()

        self.viewer.label(labels=predictions)
        self.logger.log(simplejson.dumps(predictions, indent=2,
                                         sort_keys=True))
        self.start_button.configure(state='normal')
        self.running = False

Ejemplo n.º 7

0

Mostrar archivo

 def _load_file(self):
     """Load the document at ``self.pathidx`` into the viewer.

     JPG/PNG files are OCR'd into an in-memory PDF first; anything else is
     opened directly with pdfplumber.  A ``WandException`` triggers a dialog
     offering to fix the ImageMagick policy; ``IndexError``, ``IOError`` and
     ``TypeError`` are silently ignored.
     """
     self.viewer.clear()
     current_path = self.paths[self.pathidx]
     base_name = os.path.basename(current_path)
     try:
         suffix = base_name.split('.')[-1].lower()
         if suffix not in ['jpg', 'png']:
             self.pdf = pdfplumber.open(current_path)
         else:
             picture = Image.open(current_path)
             ocr_bytes = pytesseract.image_to_pdf_or_hocr(picture,
                                                          extension='pdf')
             self.pdf = pdfplumber.load(io.BytesIO(ocr_bytes))
         self.viewer.display_pdf(self.pdf)
         position = "{} of {}".format(self.pathidx + 1, len(self.paths))
         self.doc_label.configure(text=position)
         self.logger.clear()
         self.logger.log("Showing invoice '{}'".format(current_path))
     except WandException:
         # Ask first; only attempt the fix when the user agrees.
         wants_fix = messagebox.askokcancel(
             "Error",
             "ImageMagick Policy Error! Should InvoiceNet try to fix the error?"
         )
         fixed = self._fix_policy_error() if wants_fix else wants_fix
         if not fixed:
             messagebox.showerror(
                 "ImageMagick Policy Error",
                 "Coud not fix ImageMagick policy. Rejecting the current pdf file!"
             )
         else:
             messagebox.showinfo(
                 "Policy Fixed!",
                 "ImageMagick Policy Error fixed! Restart InvoiceNet.")
     except (IndexError, IOError, TypeError):
         pass

Ejemplo n.º 8

0

Mostrar archivo

Archivo: simpleOCR.py Proyecto: AndyFromTaiwan/simpleOCR

def run_ocr(f):
    """OCR one file from ``img_path`` into text and/or a searchable PDF.

    Files without an extension are skipped.  Depending on the module flags
    ``GEN_TXT`` and ``GEN_PDF`` the recognised text is written to
    ``txt_path`` and a searchable PDF to ``pdf_path``.  Any failure is
    printed rather than raised.

    :param f: file name relative to ``img_path``
    """
    (filename, extension) = splitext(f)
    # Skips non-image files
    if extension == '':
        return

    print('Starts an OCR on', f)
    try:
        # Close the image handle deterministically once both outputs exist.
        with Image.open(join(img_path, f)) as img:
            # Simple image to string
            if GEN_TXT:
                content = pytesseract.image_to_string(img, lang=language)
                txt = join(txt_path, filename + '.txt')
                # OCR output is frequently non-ASCII; the locale default
                # encoding can fail on it, so write UTF-8 explicitly.
                with open(txt, 'w', encoding='utf-8') as tf:
                    tf.write(content)
                print('Successfully wrote the extracted text in', txt)

            # Get a searchable PDF
            if GEN_PDF:
                content = pytesseract.image_to_pdf_or_hocr(img,
                                                           lang=language,
                                                           extension='pdf')
                pdf = join(pdf_path, filename + '.pdf')
                with open(pdf, 'w+b') as pf:
                    pf.write(content)  # pdf type is bytes by default
                print('Successfully wrote the converted pdf in', pdf)

    except Exception as e:
        print("Fail!", e.__class__, "occurred:")
        print(e)
    print()

Ejemplo n.º 9

0

Mostrar archivo

    def createPdfFromImages(self, base_source_folder, parent_folder,
                            output_folder):
        """Write one searchable PDF per image found in a folder.

        Every file returned by ``FileHelper.getAllFilesInFolder`` for
        ``base_source_folder/parent_folder`` is read with OpenCV, OCR'd and
        saved to ``output_folder`` under the image's base name with a
        ``.pdf`` extension.
        """
        absolute_parent_folder_path = os.path.join(base_source_folder,
                                                   parent_folder)
        images = FileHelper.getAllFilesInFolder(absolute_parent_folder_path)

        for image in images:
            # Read image from disk
            absolute_image_path = os.path.join(absolute_parent_folder_path,
                                               image)
            im = cv2.imread(absolute_image_path, cv2.IMREAD_COLOR)

            # create PDF from tesseract OCR results
            pdf = pytesseract.image_to_pdf_or_hocr(im, extension='pdf')

            target = os.path.join(output_folder,
                                  os.path.splitext(image)[0]) + ".pdf"
            # ``with`` closes the file even if the write fails; the OCR
            # result is already bytes, so no bytearray copy is needed.
            with open(target, "w+b") as f:
                f.write(pdf)

Ejemplo n.º 10

0

Mostrar archivo

def ocr(inPath, copyPath):
    """Pre-process a receipt image, analyse its text and save a searchable PDF.

    :param inPath: source image, handed to ``process.transform``
    :param copyPath: forwarded to ``textAnalysis`` together with the OCR text

    The transformed image is written to 'ocr/output/receipt.jpg' and the
    searchable PDF to 'ocr/output/output.pdf'.  An ``IOError`` while writing
    the PDF is printed instead of raised.
    """
    outPath = 'ocr/output/receipt.jpg'
    pdfPath = 'ocr/output/output.pdf'

    process.transform(inPath, outPath)
    # load the transformed image produced by the previous step
    img = cv2.imread(outPath)

    # psm 4: Assume a single column of text of variable sizes.
    text = pytesseract.image_to_string(img,
                                       lang='eng',
                                       config='--psm 4 --oem 1')
    textAnalysis(text, copyPath)

    # Generate a PDF
    pdf = pytesseract.image_to_pdf_or_hocr(img, extension='pdf')
    try:
        # A PDF is binary data: text mode would corrupt it on Windows
        # (Python 2) or reject the bytes outright (Python 3).
        with open(pdfPath, 'wb') as f:
            f.write(pdf)
    except IOError as e:
        # ``except X as e`` and a single-argument print work on both
        # Python 2.6+ and Python 3; the output text is unchanged.
        print('IOError:  %s' % (e,))

Ejemplo n.º 11

0

Mostrar archivo

Archivo: pdf_parser.py Proyecto: boxorange/scientific-literature-mining

    def parse_PDF_by_Tesseract(self, pdf_file):
        """
        Rasterise a PDF and dump the Tesseract hOCR of one page to disk.

        Tesseract works on images, so the PDF is first converted with
        'pdf2image' (which is based on the 'Poppler' package).  The page at
        index 5 is OCR'd and its hOCR written to 'tesseract_test.html', so
        the document needs at least six pages.

        list of languages: https://github.com/tesseract-ocr/langdata

        Author's notes:
        - Adding a parameter <lang='eng'> doesn't improve the performance
          (speed), because the default lang is English.
        - dpi=600 takes more processing time than dpi=300, but no
          performance improvement was found.

        E.g., pytesseract.image_to_string(image, lang='eng', boxes=False,
        config='--psm 10 --oem 3 -c tessedit_char_whitelist=0123456789')

        Ref
        - https://www.pyimagesearch.com/2018/09/17/opencv-ocr-and-text-recognition-with-tesseract/
        """
        page_images = convert_from_path(
            pdf_file,
            dpi=300,
            transparent=True,
            fmt='tiff',
            thread_count=3,
        )

        # TODO: plain-text extraction (joining image_to_string over all
        # pages) was tried here as well; test it with further examples.
        hocr_bytes = pytesseract.image_to_pdf_or_hocr(page_images[5],
                                                      extension='hocr')
        with open('tesseract_test.html', 'wb') as html_out:
            html_out.write(hocr_bytes)

Ejemplo n.º 12

0

Mostrar archivo

Archivo: ocr_converter.py Proyecto: MadhanArts/PythonInternshipBestEnlist

def convert_To_PDF(input_files):
    """OCR each image and merge the pages into 'output/my_output.pdf'.

    :param input_files: iterable of image paths readable by ``cv2.imread``
    :return: absolute path of the merged PDF
    """
    # Local import keeps this block self-contained.
    import io

    # Creating pdffilemerger object
    pdfMerger = PyPDF2.PdfFileMerger()
    output_file_location = "output/my_output.pdf"
    try:
        for image_path in input_files:
            # Creating image object
            image = cv2.imread(image_path)

            # Converting the image to ocr pdf
            result = pytesseract.image_to_pdf_or_hocr(image, lang='eng')

            # Append the page straight from memory.  The old round trip via
            # temp.pdf passed 'rb' as the *bookmark* argument of append()
            # and made os.remove('temp.pdf') fail for an empty input list.
            pdfMerger.append(io.BytesIO(result))

        # Saving merged pdf object as pdf file
        with open(output_file_location, 'wb') as output_file:
            pdfMerger.write(output_file)
    finally:
        pdfMerger.close()
    return os.path.abspath(output_file_location)

Ejemplo n.º 13

0

Mostrar archivo

Archivo: payroll-to-excel.py Proyecto: alan-turing-institute/payroll-report

def get_text_from_image_pdf(folder, file):
    """Extract the first-page text of an image-only PDF via OCR.

    The PDF is rendered to ``folder/tmp.png``, OCR'd into
    ``folder/text_output.pdf`` and the text of that PDF's first page is
    returned.

    :param folder: directory containing ``file``; also receives the
        intermediate files
    :param file: name of the PDF inside ``folder``
    :return: text of the first page
    :raises Exception: if no text could be extracted
    """
    output_pdf_file = os.path.join(folder, "text_output.pdf")

    # Convert the pdf to a png image.  The image is written to ``folder``
    # (not the global ``args.folder``) because that is where it is read
    # back from below.
    pdf2image.convert_from_path(os.path.join(folder, file),
                                fmt="png",
                                dpi=400,
                                single_file=True,
                                output_folder=folder,
                                output_file="tmp")

    # Generate a pdf with selectable text based on the image
    pdf = pytesseract.image_to_pdf_or_hocr(os.path.join(folder, "tmp.png"),
                                           extension="pdf")
    with open(output_pdf_file, "w+b") as f:
        f.write(pdf)

    # Extract the text from the pdf
    with open(output_pdf_file, "rb") as in_pdf:
        parsed_pdf = pdftotext.PDF(in_pdf)

    # Check that we've actually got some text out from the converted file
    text = parsed_pdf[0]
    if len(text) == 0:
        # The exception used to be constructed but never raised, so empty
        # results slipped through silently.
        raise Exception("Could not extract text from converted pdf file")

    return text

Ejemplo n.º 14

0

Mostrar archivo

def extract_data_ocr(image):
    """Return ``(text, hocr)`` for ``image``.

    ``text`` is the result of ``pytesseract.image_to_string`` and ``hocr``
    the hOCR output of ``pytesseract.image_to_pdf_or_hocr``.
    """
    plain_text = pytesseract.image_to_string(image)
    hocr_markup = pytesseract.image_to_pdf_or_hocr(image, extension='hocr')
    return (plain_text, hocr_markup)

Ejemplo n.º 15

0

Mostrar archivo

Archivo: forwardgram.py Proyecto: arka0821/forwardgram

 async def handler(event):
     """Forward a message when its attached image contains a symbol pair.

     If the event carries an image, it is downloaded to the configured temp
     path and OCR'd.  When the recognised text contains a pattern such as
     ``ABC/DEF``, a "Symbol: ABCDEF" line is sent to the output channel,
     followed by the parsed message text if there is any.  Nothing is sent
     in every other case.
     """
     # Local import keeps this block self-contained.
     import asyncio

     media = event.message.media
     if media is None or not is_image(media):
         return

     image_path = config["temp_path"] + 'temp.jpg'
     await client.download_media(media, image_path)
     # time.sleep() would freeze the whole event loop for two seconds;
     # yield to other coroutines while waiting instead.
     await asyncio.sleep(2)

     # Get HOCR output
     hocr = pytesseract.image_to_pdf_or_hocr(image_path, extension='hocr')
     soup = BeautifulSoup(hocr.decode('utf-8'), 'html.parser')
     text = ''.join(
         elm.text for elm in soup.find_all("span", class_="ocrx_word"))

     symbols = re.findall(r'[A-Z]{3}\s*/\s*[A-Z]{3}', text)
     if not symbols:
         return

     text_in_image = "Symbol: " + symbols[0].replace('/', '').replace(
         " ", "")
     message_from_sender = parese_message(event.message.message)
     # text_in_image is always set here, so only the sender text can be
     # missing (the old "text_in_image is None" branch was unreachable).
     if message_from_sender is None:
         message_text = text_in_image
     else:
         message_text = text_in_image + "\n" + message_from_sender
     await client.send_message(output_channel_entity, message_text)

Ejemplo n.º 16

0

Mostrar archivo

Archivo: main.py Proyecto: tanaypatankar/Digitize-Reports

def result_coords(im):
    """Locate the bounding box of the first result/value heading in an image.

    The image is OCR'd to hOCR (a copy is written to 'tanayhocr.txt') and
    the markup is searched for an element whose text starts with one of
    'Result', 'RESULT', 'Value' or 'VALUE'.  The four integers following
    ``bbox`` in that element's title attribute are returned.

    :param im: image accepted by ``pytesseract.image_to_pdf_or_hocr``
    :return: tuple of the four bbox integers
    :raises ValueError: if no keyword is found or no bbox precedes it
    """
    hocr = pytesseract.image_to_pdf_or_hocr(im, extension='hocr')
    # Kept for compatibility: earlier versions left this dump on disk.
    with open('tanayhocr.txt', "w+b") as f:
        f.write(hocr)

    # Decode explicitly rather than re-reading the dump in the locale's
    # default encoding, which can fail on non-ASCII OCR output.
    string = hocr.decode('utf-8')

    keywords = ['Result', 'RESULT', 'Value', 'VALUE']
    result = -1
    for word in keywords:
        result = string.find('>' + word)
        if result != -1:
            break
    if result == -1:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError('None of the keywords %r were found' % (keywords,))

    # Walk back to the second 'x' before the match: the first belongs to
    # "x_wconf", the second is the last letter of "bbox".
    first_x = string.rfind('x', 1, result + 1)
    second_x = string.rfind('x', 1, first_x) if first_x != -1 else -1
    if second_x == -1:
        raise ValueError('No bbox attribute found before the keyword')

    ans = string[second_x + 2:result].split(';')[0]
    ans = ans.split(' ')
    return int(ans[0]), int(ans[1]), int(ans[2]), int(ans[3])

Ejemplo n.º 17

0

Mostrar archivo

    def _run_ocr(self):
        """OCR every page of the loaded PDF and save the result.

        Each page is rendered at resolution 100 and OCR'd into a one-page
        PDF; the pages are combined, the user is asked where to save the
        file, and the saved file replaces the current path and is reloaded.
        Does nothing when no PDF is loaded or the dialog returns no path.
        """
        if self.pdf is None:
            return

        # One single-page searchable PDF (as bytes) per source page.
        ocr_pages = [
            pytesseract.image_to_pdf_or_hocr(
                page.to_image(resolution=100).original, extension='pdf')
            for page in self.pdf.pages
        ]

        writer = PyPDF2.PdfFileWriter()
        for page_bytes in ocr_pages:
            reader = PyPDF2.PdfFileReader(io.BytesIO(page_bytes))
            writer.addPage(reader.getPage(0))

        source_path = self.paths[self.pathidx]
        folder = os.path.dirname(source_path)
        suggested_name = os.path.basename(source_path)

        path = filedialog.asksaveasfilename(
            title='Save OCR As',
            defaultextension='.pdf',
            initialdir=folder,
            initialfile=suggested_name,
            filetypes=[('PDF files', '*.pdf'), ('all files', '.*')],
        )
        if path in ('', None):
            return

        with open(path, 'wb') as out:
            writer.write(out)

        self.paths[self.pathidx] = path
        self._load_file()

Ejemplo n.º 18

0

Mostrar archivo

 def _append_pdf_page(self, image):
     """OCR ``image`` and append the resulting page(s) to ``self.pdf_writer``.

     The image is converted from BGR to RGB before being handed to
     Tesseract.
     """
     ocr_pdf = pytesseract.image_to_pdf_or_hocr(
         cv2.cvtColor(image, cv2.COLOR_BGR2RGB),
         lang="eng",
         extension="pdf",
         nice=0,
     )
     page_reader = PyPDF2.PdfFileReader(io.BytesIO(ocr_pdf))
     self.pdf_writer.appendPagesFromReader(page_reader)

Ejemplo n.º 19

0

Mostrar archivo

 def pdfGenerator(self, imagePath, name, extension, pdfPath):
     """OCR ``imagePath + name + extension`` into ``pdfPath + name + '.pdf'``.

     The path pieces are concatenated as-is, so ``imagePath`` and
     ``pdfPath`` must already end with a separator.  Any error is printed
     instead of raised.
     """
     try:
         pdf = pytesseract.image_to_pdf_or_hocr(imagePath+name+extension, extension='pdf')
         # ``with`` closes the file even when the write fails.
         with open(pdfPath+name+".pdf", "w+b") as f:
             f.write(pdf)
     except Exception as e:
         print(e)

Ejemplo n.º 20

0

Mostrar archivo

Archivo: convert_pdf_to_text_nepali.py Proyecto: theonlyNischal/Deep-Learning-Projects-and-Paper-Implementation

def get_searchable_pdf(pdf_file_path):
    """OCR every page image of a PDF and write the results to 'test.pdf'.

    :param pdf_file_path: passed to ``convert_pdf_to_imgs``
    :return: the OCR PDF bytes of the last page, or ``b''`` when there are
        no pages

    NOTE(review): each page is rendered as its own complete PDF document and
    the documents are written back to back.  Concatenated PDFs are not one
    multi-page PDF; merging properly needs a PDF library.  TODO confirm what
    callers expect from 'test.pdf' and from the return value.
    """
    # Todo: Check implementation
    images = convert_pdf_to_imgs(pdf_file_path)
    # Defined up front so an empty page list no longer ends in an
    # UnboundLocalError at the return statement.
    pdf = b''
    with open('test.pdf', 'w+b') as f:
        for img in images:
            pdf = pytesseract.image_to_pdf_or_hocr(img, extension='pdf')
            f.write(pdf)  # pdf type is bytes by default
    return pdf

Ejemplo n.º 21

0

Mostrar archivo

Archivo: image.py Proyecto: alexzanderr/python372

def save_pdf_from_image(path: str, dst_folder: str):
    """Save a searchable PDF of the image at ``path`` into ``dst_folder``.

    The PDF is named after the image file with its extension replaced by
    ``.pdf`` (``scan.v2.png`` becomes ``scan.v2.pdf``).
    """
    # Local import keeps this block self-contained.
    import os

    # Accept both separators, then strip only the final extension.  The old
    # split(".")[0] cut the name at its first dot, and split("\\") left
    # forward-slash paths unsplit.
    file_name = os.path.basename(path.replace("\\", "/"))
    image_name = os.path.splitext(file_name)[0]

    with Image.open(path) as image:
        binary_pdf = pytesseract.image_to_pdf_or_hocr(image)

    # "wb" already truncates, so no explicit truncate(0) is needed.
    with open(os.path.join(dst_folder, image_name + ".pdf"), "wb") as bin_file:
        bin_file.write(binary_pdf)

Ejemplo n.º 22

0

Mostrar archivo

Archivo: common.py Proyecto: ssundaram21/computation_hist

def make_searchable_pdf(input_path, output_file_name=None):
    """
    This function takes in the file path to a document and returns a
    searchable pdf using Tesseract through the pytesseract module.

    *WARNING* When output_file_name is None, the input file is overwritten
    with the Tesseract searchable PDF.

    Example that keeps the original file:

                # External PDF called test.pdf
                make_searchable_pdf('path/to/test.pdf', 'output_name')

    :param input_path: the filepath to be converted (str or Path)
    :param output_file_name: the file name that you wish to save the
    document under (without extensions)
    :return: output_file path
    """

    # Keeps track of output filepath
    if isinstance(input_path, str):
        input_path = Path(input_path)
    if output_file_name is None:
        output_file = input_path
        output_file_name = output_file.stem
    else:
        output_file = Path(input_path.parent, output_file_name + '.pdf')

    # Converts pdf into a list of PIL images
    images = pdf2image.convert_from_path(input_path, fmt='jpg')

    file_paths = []
    merger = PyPDF2.PdfFileMerger()
    try:
        # OCR each page into a dummy single-page pdf that will be merged
        for i, page_image in enumerate(images):
            single_page = pytesseract.image_to_pdf_or_hocr(page_image,
                                                           extension='pdf')
            page_path = Path(output_file.parent,
                             output_file_name + '_' + str(i) + '.pdf')
            with open(page_path, 'wb') as f:
                f.write(single_page)
            file_paths.append(page_path)

        # Merges the pdf files in python
        for path in file_paths:
            merger.append(str(path))

        # Writes the merged file into one document
        merger.write(str(output_file))
    finally:
        # Release the merger and delete the dummy files even when OCR or
        # merging fails, so no stray "<name>_<i>.pdf" files are left behind.
        merger.close()
        for path in file_paths:
            path.unlink()

    return output_file

Ejemplo n.º 23

0

Mostrar archivo

Archivo: tasks.py Proyecto: dodziraynard/digitaleye

def convert_images_to_pdf(self, images, output_dir):
    """OCR each image into ``output_dir/page_<index>.pdf``.

    :param images: iterable of items accepted by ``Image.open``
    :param output_dir: directory path as a string
    :return: list of the PDF paths written, in input order
    """
    written_paths = []
    for index, source in enumerate(images):
        page_bytes = pytesseract.image_to_pdf_or_hocr(Image.open(source))
        target = "".join([output_dir, "/page_", str(index), ".pdf"])
        with open(target, "wb") as handle:
            handle.write(page_bytes)
        written_paths.append(target)
    return written_paths

Ejemplo n.º 24

0

Mostrar archivo

Archivo: server.py Proyecto: ompedans/annotasyon

def _getpageobj(fp):
    """Build the page payload for ``fp``.

    ``fp`` is first passed through ``toBinaryImage``; the result is OCR'd
    with the "fas" language into hOCR and also converted by ``img_to_data``.

    :return: dict with keys "data_url" and "hocr" (hOCR decoded as UTF-8)
    """
    # presumably a path or file object, since it is handed to Image.open
    binarized = toBinaryImage(fp)
    hocr_bytes = pytesseract.image_to_pdf_or_hocr(Image.open(binarized),
                                                  extension="hocr",
                                                  lang="fas")
    page = {}
    page["data_url"] = img_to_data(binarized)
    page["hocr"] = str(hocr_bytes, encoding="utf-8")
    return page

Ejemplo n.º 25

0

Mostrar archivo

Archivo: it.py Proyecto: GiovaniCenta/ImageToTextorPdf

 def imgtoPDF(self, path):
     """Let the user pick an image and write its searchable PDF to 'test.pdf'.

     :param path: ignored; the file is always taken from
         ``self.clickSelectfile()``

     Does nothing when no file is selected or the file cannot be read as an
     image.
     """
     path = self.clickSelectfile()
     if not path:
         # No image selected (presumably the dialog was cancelled): this
         # used to crash further down inside OpenCV.
         return

     img = cv2.imread(path)
     if img is None:
         # cv2.imread returns None for unreadable or non-image files.
         return
     img = cv2.cvtColor(
         img, cv2.COLOR_BGR2RGB
     )  # OpenCV loads images as BGR; convert to RGB before OCR
     pdf = pytesseract.image_to_pdf_or_hocr(img, extension='pdf')
     with open('test.pdf', 'w+b') as f:
         f.write(pdf)

Ejemplo n.º 26

0

Mostrar archivo

 def ocr(i):
     """OCR the image ``tempfolder + i`` into a PDF and report progress.

     The PDF is written to ``tempfolder`` under characters ``i[-9:-4]`` of
     the image name.  The shared ``count`` is incremented, and a progress
     line is printed whenever its value is one of the milestones listed in
     ``progress``.
     """
     pdf = pytesseract.image_to_pdf_or_hocr(tempfolder + i, extension='pdf')
     count.inc()
     with open(tempfolder + i[-9:-4] + '.pdf', 'w+b') as f:
         f.write(pdf)
     try:
         # Read the counter once so both numbers in the message agree.
         done = count.val
         if done in progress:
             # Use the numeric value: dividing the counter object itself
             # raised a TypeError that the old bare ``except`` swallowed.
             print(f"Finished {int(done / total * 100)}%: {done} pages")
     except (TypeError, ZeroDivisionError):
         # Progress output is best-effort and must never abort the OCR job.
         pass

Ejemplo n.º 27

0

Mostrar archivo

Archivo: tesseract_ocr.py Proyecto: saahithhegde/invoice_ocr

 def generate_simple_pdf(self, image, filename):
     """OCR ``image`` and write the searchable PDF.

     The output path is ``"processeddata" + filename + "_simple.pdf"``; the
     pieces are concatenated without a separator.
     """
     pdf_bytes = pytesseract.image_to_pdf_or_hocr(image,
                                                  lang='eng',
                                                  config='',
                                                  nice=0,
                                                  extension='pdf')
     # ``with`` closes the file even when the write fails.
     with open("processeddata" + filename + "_simple.pdf", "w+b") as f:
         f.write(pdf_bytes)

Ejemplo n.º 28

0

Mostrar archivo

Archivo: tesseract_ocr.py Proyecto: saahithhegde/invoice_ocr

 def generate_html(self, image, filename):
     """OCR ``image`` and write the hOCR markup.

     The output path is ``"processeddata" + filename + ".html"``; the pieces
     are concatenated without a separator.
     """
     content = pytesseract.image_to_pdf_or_hocr(image,
                                                lang='eng',
                                                nice=0,
                                                extension='hocr')
     # Overwrite any existing file ('w', binary); ``with`` closes the
     # handle even when the write fails.
     with open("processeddata" + filename + ".html", 'w+b') as f:
         f.write(content)

Ejemplo n.º 29

0

Mostrar archivo

Archivo: main.py Proyecto: Sleeck/docker-tesseract

def get_ocr_pdf():
    """
    OCR image to pdf
    ---
    tags:
      - ocr
    parameters:
      - in: formData
        name: image
        type: file
        required: true
        description: Image file
      - in: formData
        name: lang
        type: string
        required: true
        description: Language
      - in: formData
        name: timeout
        type: int
        required: false
        description: Timeout
    responses:
      500:
        description: Error message
      200:
        description: Pdf file
    """
    # NOTE(review): the docstring above looks like an API spec that tooling
    # may parse at runtime, so it is kept verbatim - confirm before editing.

    if 'image' not in request.files:
        return "No file uploaded"

    image_file = request.files['image']

    if image_file.filename == '':
        return "No file name"

    if request.form['lang'] == '':
        return "No language defined"

    # Default OCR timeout in seconds, overridable by a positive form value.
    timeout = 300
    if "timeout" in request.form and int(request.form['timeout']) > 0:
        timeout = int(request.form['timeout'])

    # The context manager closes (and thereby deletes) the temporary upload
    # as soon as OCR is done, instead of waiting for garbage collection.
    with NamedTemporaryFile() as file:
        image_file.save(file.name)
        pdf = pytesseract.image_to_pdf_or_hocr(file.name,
                                               lang=request.form['lang'],
                                               config='',
                                               nice=0,
                                               extension='pdf',
                                               timeout=timeout)

    return pdf

Ejemplo n.º 30

0

Mostrar archivo

Archivo: engine_main.py Proyecto: jannikwiessler/invoiceFeatureExtractionWin

    def __init__(self, invoiceData, destinationPath, templateFolerPath, pdfDummyPath):
        """Extract data from every valid invoice and append it to an Excel file.

        :param invoiceData: object whose ``iterrows()`` yields rows with the
            fields ``valid``, ``typ`` and ``path``
        :param destinationPath: Excel file that is read, extended and
            rewritten for each invoice that yields data
        :param templateFolerPath: template folder passed to ``get_invoicedata``
        :param pdfDummyPath: scratch PDF path used for non-PDF invoices,
            which are OCR'd (language 'deu') into it first
        """
        self.__writeExcelFlagg = 1  # this is for user interaction in upcoming versions
        self.__destinationfile = destinationPath
        self.__invoiceData = invoiceData
        # default excel dictionary
        field_dic = {'Lieferdatum' : 'date_service',
                     'Bestelldatum' : 'date_order',
                     'Zulieferer' : 'issuer',
                     'Rechnungsnummer' : 'invoice_number',
                     'Betrag' : 'amount_sum',
                     'Netto' : 'amount_net',
                     'Template' : 'template'}

        templatefolder = templateFolerPath
        pdfDummy = pdfDummyPath

        for _, info in self.__invoiceData.iterrows():
            if info['valid'] == -1:
                print(info['typ']+' is not supported')
                continue

            # we have a valid datatype
            if info['typ'] != 'pdf':  # if invoice is not pdf: create pdf
                logging.debug('egine_main: invoice is not pdf')
                x = image_to_pdf_or_hocr(info.path,lang='deu')
                # ``with`` closes the dummy file even when the write fails.
                with open(pdfDummy, "w+b") as f:
                    f.write(x)
                pdf = pdfDummy
            else:  # invoice is pdf
                logging.debug('egine_main: invoice is pdf')
                pdf = info.path

            logging.debug('egine_main: starting extract_mydata_pdf with pdf: '+pdf)
            text = extract_mydata_pdf(pdf)  # get data from created pdf
            logging.debug('egine_main: extract_mydata_pdf completed')
            # Identity checks: ``== None`` / ``!= None`` can be overridden
            # by the compared object's own comparison methods.
            if text is None:
                logging.debug('egine_main: extracted text is empty')
            logging.debug('egine_main: starting get_invoicedata')
            data = get_invoicedata(text,templatefolder,field_dic)  # get data from extracted text
            if data is not None:
                logging.debug('egine_main: data != None')
            else:
                logging.debug('egine_main: data == None')
            logging.debug('egine_main: get_invoicedata completed')

            # do the xlsx writing
            if self.__writeExcelFlagg == 1 and data is not None:
                logging.debug('egine_main: starting xlsWriting')
                df = read_excel(self.__destinationfile)
                logging.debug('egine_main: reading completed')
                df = add_todataframe(df,field_dic,data)
                logging.debug('egine_main: adding data completed')
                df.to_excel(self.__destinationfile,index=False)
                logging.debug('egine_main: xlsWriting completed')