Ejemplo n.º 1
0
def optimized_image2text(filename, lang='eng', verbose=False):
    """Preprocess an image with scantailor-cli and OCR the resulting images.

    ``scantailor-cli`` is run with a private temporary directory as its
    output directory. Every regular file found there afterwards is passed to
    ``enhance_ocr.image2text`` (in sorted filename order) and the non-empty
    results are joined with newlines.

    Args:
        filename: Path of the image to preprocess and OCR.
        lang: Language argument forwarded to ``enhance_ocr.image2text``.
        verbose: Print a progress line and forward the flag to the OCR call.

    Returns:
        The concatenated OCR text, or ``False`` if scantailor-cli returned a
        non-zero exit code or no image yielded any text.

    Raises:
        OSError: If ``scantailor-cli`` cannot be started (propagated from
            ``subprocess.call``). The temporary directory is still removed.
    """
    ocr_txt = False

    ocr_temp_dirname = tempfile.mkdtemp(prefix="opensemanticetl_ocr_descew_")

    # try/finally guarantees the temporary directory is removed even when
    # the external tool cannot be started or the run is interrupted.
    try:
        if verbose:
            print("Optimizing image {}".format(filename))

        # run the external scantailor-cli tool
        returncode = subprocess.call(
            ['scantailor-cli', filename, ocr_temp_dirname])

        if returncode != 0:
            sys.stderr.write(
                "Error: Descewing images for OCR failed for {} with return code {}\n"
                .format(filename, returncode))
            return ocr_txt

        for image in sorted(os.listdir(ocr_temp_dirname)):

            imagefilename = os.path.join(ocr_temp_dirname, image)

            # ignore the cache directory of scantailor, only files in directory
            if not os.path.isfile(imagefilename):
                continue

            # Only ordinary errors are reported and skipped;
            # KeyboardInterrupt / SystemExit propagate with their
            # original traceback.
            try:
                text = enhance_ocr.image2text(imagefilename,
                                              lang,
                                              verbose=verbose)

                # free the disk space right away instead of at the very end
                os.remove(imagefilename)

                if text:
                    if ocr_txt:
                        ocr_txt = ocr_txt + '\n' + text
                    else:
                        ocr_txt = text

            except Exception as e:
                sys.stderr.write(
                    "Exception while OCR descewed image of: {} - Maybe descewed image {} corrupt? Exception: {}\n"
                    .format(filename, imagefilename, e))

    finally:
        shutil.rmtree(ocr_temp_dirname)

    return ocr_txt
def optimized_image2text(filename, lang='eng', verbose=False):
    """Run scantailor-cli on an image and OCR every image it produces.

    The external tool writes into a temporary directory that is removed
    automatically when the function exits, whether normally or through an
    exception. Each regular file in that directory is OCRed with
    ``enhance_ocr.image2text`` in sorted filename order; non-empty results
    are concatenated with a newline between them.

    Args:
        filename: Path of the image to preprocess and OCR.
        lang: Language argument forwarded to ``enhance_ocr.image2text``.
        verbose: Print a progress line and forward the flag to the OCR call.

    Returns:
        The concatenated OCR text, or ``False`` if scantailor-cli returned a
        non-zero exit code or no image yielded any text.

    Raises:
        OSError: If ``scantailor-cli`` cannot be started (propagated from
            ``subprocess.call``).
    """
    if verbose:
        print("Optimizing image {}".format(filename))

    ocr_txt = False

    # The context manager removes the directory on every exit path, so a
    # missing tool or a Ctrl-C no longer leaves temp directories behind.
    with tempfile.TemporaryDirectory(
            prefix="opensemanticetl_ocr_descew_") as ocr_temp_dirname:

        # run the external scantailor-cli tool
        exit_code = subprocess.call(
            ['scantailor-cli', filename, ocr_temp_dirname])

        if exit_code != 0:
            sys.stderr.write(
                "Error: Descewing images for OCR failed for {} with return code {}\n"
                .format(filename, exit_code))
            return False

        for image in sorted(os.listdir(ocr_temp_dirname)):

            imagefilename = os.path.join(ocr_temp_dirname, image)

            # ignore the cache directory of scantailor, only files in directory
            if not os.path.isfile(imagefilename):
                continue

            try:
                page_text = enhance_ocr.image2text(
                    imagefilename, lang, verbose=verbose)

                # release the disk space as soon as the image is processed
                os.remove(imagefilename)

                if page_text:
                    ocr_txt = (ocr_txt + '\n' + page_text
                               if ocr_txt else page_text)

            # KeyboardInterrupt / SystemExit are not Exception subclasses,
            # so they propagate untouched with their original traceback.
            except Exception as e:
                sys.stderr.write(
                    "Exception while OCR descewed image of: {} - Maybe descewed image {} corrupt? Exception: {}\n"
                    .format(filename, imagefilename, e))

    return ocr_txt
def pdfimages2text(filename,
                   lang='eng',
                   verbose=False,
                   pdf_ocr=True,
                   pdf_ocr_descew=False):
    """OCR the images embedded in a PDF, grouped by page number.

    The images are extracted with the ``pdfimages`` command line tool into a
    temporary directory that is always removed before returning. The page
    number is parsed from each extracted filename
    (``image-pagenumber-imagenumber.jpg``).

    Args:
        filename: Path of the PDF file.
        lang: Language argument forwarded to the OCR functions.
        verbose: Forwarded to the OCR functions.
        pdf_ocr: If true, OCR each image with ``enhance_ocr.image2text``.
        pdf_ocr_descew: If true, additionally OCR each image with
            ``enhance_ocr_descew.optimized_image2text``.

    Returns:
        A tuple ``(ocr_txt, ocr_descew_txt)`` of dicts mapping page number
        to text. Texts of several images on one page are joined with
        ``'\\n'`` (plain OCR) or ``'\\n\\n'`` (deskewed OCR). Both dicts are
        empty if ``pdfimages`` returned a non-zero exit code.

    Raises:
        OSError: If ``pdfimages`` cannot be started (propagated from
            ``subprocess.call``).
    """
    ocr_txt = {}
    ocr_descew_txt = {}

    # The context manager removes the directory on every exit path; the
    # previous version leaked it whenever pdfimages failed or an exception
    # escaped the loop.
    with tempfile.TemporaryDirectory(
            prefix="opensemanticetl_pdf_ocr_") as ocr_temp_dirname:

        # Extract all images of the pdf to tempdir with commandline tool "pdfimages" from poppler pdf toolbox
        # -j = export as JPEG
        # -p = write page name in image filename
        returncode = subprocess.call([
            'pdfimages', '-p', '-j', filename,
            os.path.join(ocr_temp_dirname, 'image')
        ])

        if returncode != 0:
            sys.stderr.write(
                "Error: Extracting images from PDF failed for {} {}\n".format(
                    filename, returncode))
            return ocr_txt, ocr_descew_txt

        for image in sorted(os.listdir(ocr_temp_dirname)):

            imagefilename = os.path.join(ocr_temp_dirname, image)

            if pdf_ocr:

                # Only ordinary errors are reported and skipped;
                # KeyboardInterrupt / SystemExit propagate unchanged.
                try:
                    # extract page number from extracted image filename (image-pagenumber-imagenumber.jpg)
                    pagenumber = int(image.split('-')[1])

                    text = enhance_ocr.image2text(filename=imagefilename,
                                                  lang=lang,
                                                  verbose=verbose)

                    if text:
                        if pagenumber in ocr_txt:
                            ocr_txt[pagenumber] += '\n' + text
                        else:
                            ocr_txt[pagenumber] = text

                except Exception as e:
                    sys.stderr.write(
                        "Exception while OCR of PDF: {} - maybe corrupt image: {} - exception: {}\n"
                        .format(filename, imagefilename, e))

            if pdf_ocr_descew:

                try:
                    # extract page number from extracted image filename (image-pagenumber-imagenumber.jpg)
                    pagenumber = int(image.split('-')[1])

                    text = enhance_ocr_descew.optimized_image2text(
                        imagefilename, lang, verbose=verbose)

                    if text:
                        if pagenumber in ocr_descew_txt:
                            ocr_descew_txt[pagenumber] += '\n\n' + text
                        else:
                            ocr_descew_txt[pagenumber] = text

                except Exception as e:
                    sys.stderr.write(
                        "Exception while optimized ocr pdf: {} - maybe corrupt image: {} - exception: {}\n"
                        .format(filename, imagefilename, e))

            # free the disk space as soon as the image has been processed
            os.remove(imagefilename)

    return ocr_txt, ocr_descew_txt
def pdfimages2text(filename, lang='eng', verbose=False, pdf_ocr=True, pdf_ocr_descew=False):
    """Extract the images of a PDF with ``pdfimages`` and OCR them per page.

    Args:
        filename: Path of the PDF file.
        lang: Language argument forwarded to the OCR functions.
        verbose: Forwarded to the OCR functions.
        pdf_ocr: If true, OCR each image with ``enhance_ocr.image2text``.
        pdf_ocr_descew: If true, additionally OCR each image with
            ``enhance_ocr_descew.optimized_image2text``.

    Returns:
        A tuple ``(ocr_txt, ocr_descew_txt)`` of dicts keyed by the page
        number parsed from the extracted image filename. Several images of
        one page are joined with ``'\\n'`` (plain OCR) or ``'\\n\\n'``
        (deskewed OCR). Both dicts are empty if ``pdfimages`` returned a
        non-zero exit code.

    Raises:
        OSError: If ``pdfimages`` cannot be started (propagated from
            ``subprocess.call``). The temporary directory is still removed.
    """
    # Local import: only the cleanup below needs shutil.
    import shutil

    ocr_txt = {}
    ocr_descew_txt = {}

    ocr_temp_dirname = tempfile.mkdtemp(prefix="opensemanticetl_pdf_ocr_")

    # try/finally so the temporary directory is removed on every exit path,
    # including a failed extraction, a missing tool or an interrupt.
    try:
        # Extract all images of the pdf to tempdir with commandline tool "pdfimages" from poppler pdf toolbox
        # -j = export as JPEG
        # -p = write page name in image filename
        exit_code = subprocess.call(
            ['pdfimages', '-p', '-j', filename,
             os.path.join(ocr_temp_dirname, 'image')])

        if exit_code != 0:
            sys.stderr.write(
                "Error: Extracting images from PDF failed for {} {}\n"
                .format(filename, exit_code))
            return ocr_txt, ocr_descew_txt

        for image in sorted(os.listdir(ocr_temp_dirname)):

            imagefilename = os.path.join(ocr_temp_dirname, image)

            if pdf_ocr:
                # KeyboardInterrupt / SystemExit are not Exception
                # subclasses and therefore propagate untouched.
                try:
                    # extract page number from extracted image filename (image-pagenumber-imagenumber.jpg)
                    pagenumber = int(image.split('-')[1])

                    page_text = enhance_ocr.image2text(
                        filename=imagefilename, lang=lang, verbose=verbose)

                    if page_text:
                        if pagenumber in ocr_txt:
                            ocr_txt[pagenumber] += '\n' + page_text
                        else:
                            ocr_txt[pagenumber] = page_text

                except Exception as e:
                    sys.stderr.write(
                        "Exception while OCR of PDF: {} - maybe corrupt image: {} - exception: {}\n"
                        .format(filename, imagefilename, e))

            if pdf_ocr_descew:
                try:
                    # extract page number from extracted image filename (image-pagenumber-imagenumber.jpg)
                    pagenumber = int(image.split('-')[1])

                    page_text = enhance_ocr_descew.optimized_image2text(
                        imagefilename, lang, verbose=verbose)

                    if page_text:
                        if pagenumber in ocr_descew_txt:
                            ocr_descew_txt[pagenumber] += '\n\n' + page_text
                        else:
                            ocr_descew_txt[pagenumber] = page_text

                except Exception as e:
                    sys.stderr.write(
                        "Exception while optimized ocr pdf: {} - maybe corrupt image: {} - exception: {}\n"
                        .format(filename, imagefilename, e))

            # free the disk space as soon as the image has been processed
            os.remove(imagefilename)

    finally:
        shutil.rmtree(ocr_temp_dirname)

    return ocr_txt, ocr_descew_txt
Ejemplo n.º 5
0
def pdfimages2text(filename,
                   lang='eng',
                   verbose=False,
                   pdf_ocr=True,
                   pdf_ocr_descew=False,
                   cache=None):
    """OCR the images embedded in a PDF, grouped by page number.

    If ``cache`` is given, the result is first looked up with
    ``load_cache``; OCR only runs when that lookup raises
    ``FileNotFoundError`` or ``KeyError``. Otherwise the images are
    extracted with the ``pdfimages`` command line tool into a temporary
    directory (always removed before returning) and OCRed one by one.

    Args:
        filename: Path of the PDF file.
        lang: Language argument forwarded to the OCR functions.
        verbose: Print a note on cache miss and forward the flag to the
            OCR functions.
        pdf_ocr: If true, OCR each image with ``enhance_ocr.image2text``.
        pdf_ocr_descew: If true, additionally OCR each image with
            ``enhance_ocr_descew.optimized_image2text``.
        cache: Passed through to ``load_cache``; ``None`` disables the
            cache lookup.

    Returns:
        A tuple ``(ocr_txt, ocr_descew_txt)`` of dicts filled through
        ``append_page`` with the page number parsed from the extracted
        image filename. ``({}, {})`` if ``pdfimages`` returned a non-zero
        exit code.

    Raises:
        OSError: If ``pdfimages`` cannot be started (propagated from
            ``subprocess.call``).
    """
    ocr_txt = {}
    ocr_descew_txt = {}
    if cache is not None:
        try:
            return load_cache(filename, cache, lang, pdf_ocr, pdf_ocr_descew)
        except (FileNotFoundError, KeyError):
            if verbose:
                print('Not in OCR cache, starting OCR for {}'.format(filename))

    # The context manager removes the directory on every exit path; the
    # previous version leaked it on the early return below and whenever an
    # exception escaped.
    with tempfile.TemporaryDirectory(
            prefix="opensemanticetl_pdf_ocr_") as ocr_temp_dirname:

        # Extract all images of the pdf to tempdir with commandline tool
        # "pdfimages" from poppler pdf toolbox
        # -j = export as JPEG
        # -p = write page name in image filename
        returncode = subprocess.call([
            'pdfimages', '-p', '-j', filename,
            os.path.join(ocr_temp_dirname, 'image')
        ])

        if returncode != 0:
            sys.stderr.write(
                "Error: Extracting images from PDF failed for {} {}\n".format(
                    filename, returncode))
            return {}, {}

        for image in sorted(os.listdir(ocr_temp_dirname)):

            imagefilename = os.path.join(ocr_temp_dirname, image)

            if pdf_ocr:

                # Catch Exception, not BaseException, so Ctrl-C and
                # SystemExit are no longer swallowed and logged as a
                # "corrupt image".
                try:
                    text = enhance_ocr.image2text(filename=imagefilename,
                                                  lang=lang,
                                                  verbose=verbose)

                    if text:
                        # extract page number from extracted image
                        # filename (image-pagenumber-imagenumber.jpg)
                        pagenumber = int(image.split('-')[1])

                        append_page(ocr_txt, pagenumber, text)
                except Exception as e:
                    sys.stderr.write(
                        "Exception while OCR of PDF: {} - "
                        "maybe corrupt image: {} - exception: {}\n".format(
                            filename, imagefilename, e))

            if pdf_ocr_descew:

                try:
                    text = enhance_ocr_descew.optimized_image2text(
                        imagefilename, lang, verbose=verbose)

                    if text:
                        # extract page number from extracted image
                        # filename (image-pagenumber-imagenumber.jpg)
                        pagenumber = int(image.split('-')[1])

                        # Collect into the result dict; the previous code
                        # passed the enhance_ocr_descew module here, so the
                        # deskewed text never reached the return value.
                        append_page(ocr_descew_txt, pagenumber, text)

                except Exception as e:
                    sys.stderr.write(
                        "Exception while optimized ocr pdf: {} - "
                        "maybe corrupt image: {} - exception: {}\n".format(
                            filename, imagefilename, e))

            # free the disk space as soon as the image has been processed
            os.remove(imagefilename)

    return ocr_txt, ocr_descew_txt