Beispiel #1
0
def write_crops(file_name,
                cropped_tables=None,
                cropped_text=None,
                temp_table_path=TABLE_FOLDER,
                temp_text_path=TEXT_FOLDER,
                page_number=None):
    """
    Write the table crops and the text crop of one page to disk as jpeg files.

    Every table crop is converted to greyscale, deskewed with ``deskew.Deskew``
    and saved as
    ``<temp_table_path>/<file_name>/table_pag_<page_number>_<index>.jpeg``.
    The text crop is saved as it is to
    ``<temp_text_path>/<file_name>/text_pag_<page_number>.jpeg``.
    The destination folders are not created by this function.

    :param file_name: name of the sub-folder (under both base paths) to write into
    :param cropped_tables: iterable of pillow images, or None if there are no tables
    :param cropped_text: a single pillow image (``.save`` is called on it directly),
        or None if there is no text crop
    :param temp_table_path: base folder for table images
    :param temp_text_path: base folder for text images
    :param page_number: page number used in the output file names
    :return: tuple ``(table_paths, text_path)``; ``table_paths`` is the list of
        written table image paths (empty if none), ``text_path`` is the written
        text image path or None
    :raises OutputError: if an image cannot be written to disk
    """
    logger.info('Writing cropped tables...')
    table_paths = []
    text_path = None
    if cropped_tables is not None:
        for index, table_image in enumerate(cropped_tables):
            new_file_path = os.path.join(
                temp_table_path, str(file_name),
                'table_pag_{pag_num}_{c}.jpeg'.format(pag_num=page_number, c=index))
            grey_table = table_image.convert('L')
            logger.info('Deskewing table...')
            sd = deskew.Deskew(input_numpy=np.asarray(grey_table), output_numpy=True)
            de_skewed_image_np = sd.run()
            logger.info('Deskew done')
            # back from numpy to a greyscale pillow image before saving
            deskewed_table = Image.fromarray(de_skewed_image_np).convert(mode='L')
            try:
                deskewed_table.save(new_file_path, dpi=(EXTRACTION_DPI, EXTRACTION_DPI))
                logger.info('Image_{} wrote on disk'.format(new_file_path))
            # a tuple is required here: ``IOError or ValueError`` evaluates to
            # ``IOError`` only, which silently let ValueError through
            except (IOError, ValueError) as e:
                raise OutputError('Cannot write image on disk: \n{}'.format(e)) from e
            table_paths.append(new_file_path)
        logger.info('Writing cropped tables done.')
    else:
        logger.info('No tables to write on disk')

    if cropped_text is not None:
        logger.info('Writing cropped text...')
        new_file_path = os.path.join(temp_text_path, str(file_name),
                                     'text_pag_{}.jpeg'.format(page_number))
        try:
            cropped_text.save(new_file_path,
                              dpi=(EXTRACTION_DPI, EXTRACTION_DPI))
            logger.info('Image_{} wrote on disk'.format(new_file_path))
        except (IOError, ValueError) as e:
            raise OutputError('Cannot write image on disk: \n{}'.format(e)) from e
        logger.info('Writing cropped text done.')
        text_path = new_file_path
    return table_paths, text_path
def write_image_on_disk(file_name, pil_image, page=0, path=PATH_TO_EXTRACTED_IMAGES):
    """
    Write one page image to disk as ``<path>/<file_name>_page_<page>.jpeg``.

    :param file_name: name of original file, used as prefix of the image name
    :param pil_image: image object exposing ``save(path, dpi=...)``
        (presumably a pillow image - confirm against callers)
    :param page: page counter from upward function.
    :param path: path/to/folder where to write images
    :return: None
    :raises OutputError: if the image cannot be written to disk
    """
    logger.info('Writing temp images on disk...')
    path_to_image = os.path.join(path, '{fn}_page_{c}.jpeg'.format(fn=file_name, c=page))
    try:
        pil_image.save(path_to_image, dpi=(EXTRACTION_DPI, EXTRACTION_DPI))
        logger.info('Image_{} wrote on disk'.format(page))
    # a tuple is required here: ``IOError or ValueError`` evaluates to
    # ``IOError`` only, which silently let ValueError through
    except (IOError, ValueError) as e:
        raise OutputError(
            message='Cannot write image on disk: \n{}'.format(e)
        ) from e
def do_tesseract_on_tables(table_path, destination_pdf_path=TABLE_FOLDER):
    """
    OCR a table image into a pdf with tesseract, then extract its tables to csv with tabula.

    Tesseract is run from ``<destination_pdf_path>/<pdf_name>``, where
    ``pdf_name`` is the name of the folder containing ``table_path``. It writes
    ``<image name>.pdf`` there; tabula then converts that pdf into
    ``<image name>.csv`` in the same folder.

    :param table_path: path/to/table image to analyze
    :param destination_pdf_path: base folder holding the per-pdf sub-folders in
        which the pdf and csv are written
    :return: None
    :raises InputError: if ``table_path`` does not exist, or the pdf expected from
        tesseract is missing
    :raises OutputError: if tesseract exits with a non-zero code or tabula fails
    """
    # checking if file exists
    if not os.path.isfile(table_path):
        raise InputError('{} not found'.format(table_path))
    # takes only file name without extension
    input_file_name = os.path.basename(table_path).split(os.extsep)[0]
    # take the name of the folder in which the images are stored, that is the name of the original pdf
    pdf_name = os.path.dirname(table_path).split(os.path.sep)[-1]
    # tesseract runs with a different working directory (see ``cwd`` below), so
    # a relative input path must be made absolute to keep pointing at the file
    # that was just checked
    input_file = os.path.abspath(table_path)

    args = [
        "tesseract",
        input_file,  # actual file to be analyzed
        input_file_name,  # output file name
        '-l', 'ita',  # Italian language
        '--oem', '1',  # LSTM OCR engine (Legacy + LSTM is not available in Italian)
        '--psm', '12',  # sparse text with OSD
        'pdf',  # config file name, given once (it used to be passed twice)
    ]
    start_folder = os.path.join(destination_pdf_path, pdf_name)
    proc = Popen(
        args,
        stdin=PIPE,
        stdout=PIPE,
        stderr=STDOUT,
        cwd=start_folder
    )
    # do the actual job on CL; stderr is merged into stdout, so the second
    # element returned by communicate() is always None
    output, _ = proc.communicate()

    if proc.returncode != 0:
        # something went wrong
        logger.error("Error while extracting pdf from {}".format(table_path))
        logger.error("Tesseract Output: {}".format(output))
        # report the merged output: the separate stderr value is always None
        raise OutputError('Tesseract is not performing well...\n{}'
                          .format("Tesseract Error: {}".format(output)))

    # Everything went well
    logger.info("pdf was successfully extracted from: {}".format(input_file_name))
    logger.info('Tesseract output: {}'.format(output))

    # now extracting tables with tabula
    tabula_input_path = os.path.join(start_folder, '{}.pdf'.format(input_file_name))
    if not os.path.isfile(tabula_input_path):
        raise InputError('{} was not found. Maybe was not created?'.format(tabula_input_path))
    tabula_output_path = os.path.join(start_folder, '{}.csv'.format(input_file_name))
    try:
        tabula.convert_into(
            tabula_input_path,
            output_path=tabula_output_path,
            output_format='csv',
            pages='all'
        )
    except Exception as e:
        raise OutputError('Tabula is not performing well...\n{}'.format(e)) from e
def _find_extracted_page_file(temp_folder, thread_name, page, max_width=6):
    """
    Locate the pgm file written by pdftoppm for ``page``.

    The page number in the file name may be zero padded depending on how many
    pages the document has (``1``, ``01``, ``001``...), so every padding width
    from the plain number up to ``max_width`` digits is tried in turn.

    :param temp_folder: folder in which pdftoppm wrote the image
    :param thread_name: thread name used in the file prefix
    :param page: page number that was extracted
    :param max_width: largest zero padded width to try
    :return: path of the first existing candidate; the unpadded path if none
        exists, so that the caller fails on a meaningful file name
    """
    digits = str(page)
    for width in range(len(digits), max_width + 1):
        candidate = os.path.join(
            temp_folder,
            'temp-{tn}-{n}.pgm'.format(tn=thread_name, n=digits.zfill(width)))
        if os.path.isfile(candidate):
            return candidate
    return os.path.join(temp_folder,
                        'temp-{tn}-{n}.pgm'.format(tn=thread_name, n=digits))


def from_pdf_to_pil_generator(file_path,
                              temp_folder=TEMP_IMG_FOLDER_FROM_PDF,
                              thread_name=None):
    """
    Yield the pages of a pdf one at a time as greyscale pillow images.

    Each page is extracted with ``pdftoppm`` into ``temp_folder``, loaded in
    RAM, deleted from disk and then yielded, so only one page is held at once.
    The generator ends as soon as pdftoppm exits with a non-zero code; since
    stderr is merged into stdout, a real pdftoppm failure cannot be told apart
    from reaching the end of the document and also ends the iteration (the
    pdftoppm output is logged as a warning).

    :param file_path: path/to/file.pdf
    :param temp_folder: path/to/folder to store temp image before acquiring it in RAM
    :param thread_name: name of the thread in case of batch process; it is part
        of the temp file name
    :return: generator of pillow images in mode 'L'
    :raises InputError: if ``file_path`` does not exist or an extracted page
        image cannot be found
    :raises OutputError: if ``temp_folder`` cannot be created
    """
    if not os.path.isfile(file_path):
        raise InputError(message='{} not found'.format(file_path))

    if not os.path.isdir(temp_folder):
        try:
            os.makedirs(temp_folder)
            logger.info('Temp folder for extraction written on disk')
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise OutputError(message=exc) from exc
            logger.info(
                '{} already exists. No need to create it'.format(temp_folder))

    page = 1
    # Extract one page at a once, from first page to last.
    while True:
        args = [
            "pdftoppm", "-l",
            str(page), "-f",
            str(page), "-r",
            str(EXTRACTION_DPI), "-gray", file_path,
            os.path.join(temp_folder, "temp-{}".format(thread_name))
        ]
        proc = Popen(
            args,
            stdin=PIPE,
            stdout=PIPE,
            stderr=STDOUT,
        )
        # stderr is merged into stdout, so the second value is always None
        output, _ = proc.communicate()

        if proc.returncode != 0:
            # case mostly used for stopping iteration when EOF
            logger.warning('pdftoppm output: {}'.format(output))
            logger.warning('Probably reached end of file.')
            # a plain return ends the generator; ``raise StopIteration`` inside
            # a generator is turned into RuntimeError (PEP 479)
            return

        # Everything went well
        logger.info("Page {} successfully extracted".format(page))
        fp = _find_extracted_page_file(temp_folder, thread_name, page)
        try:
            # convert() loads the pixel data and returns a new image, so the
            # file can be closed and deleted safely afterwards
            with Image.open(fp) as page_file:
                img = page_file.convert(mode='L')
        except FileNotFoundError as e:
            raise InputError(message=e) from e
        if os.path.exists(fp):
            os.remove(fp)
        page += 1
        yield img
Beispiel #5
0
def create_temp_folders(file_name,
                        temp_table_folder=TABLE_FOLDER,
                        temp_text_folder=TEXT_FOLDER):
    """
    Make sure the table and text temp folders and their ``file_name`` sub-folders exist.

    Existing ``<folder>/<file_name>`` sub-folders are NOT emptied: the removal
    step is currently disabled, only the related log messages are emitted.

    :param file_name: name of the sub-folder to create under both folders
    :param temp_table_folder: base folder for table images
    :param temp_text_folder: base folder for text images
    :return: None
    :raises OutputError: if a folder cannot be created for a reason other than
        it already existing
    """
    table_subfolder = os.path.join(temp_table_folder, str(file_name))
    text_subfolder = os.path.join(temp_text_folder, str(file_name))

    logger.info('Clear and create temp file for images from pdf')
    # creates folder for table images per page
    if not os.path.isdir(temp_table_folder):
        try:
            os.makedirs(temp_table_folder)
            logger.info('{} created successfully'.format(temp_table_folder))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise OutputError(
                    '{} was not created correctly.'.format(temp_table_folder)) from exc
            logger.info('{} already present'.format(temp_table_folder))

    # creates folder for text images per page
    if not os.path.isdir(temp_text_folder):
        try:
            os.makedirs(temp_text_folder)
            # logged only once the folder has really been created
            logger.info(temp_text_folder + ' folder created successfully')
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise OutputError(
                    '{} was not created correctly.'.format(temp_text_folder)) from exc
            logger.info('{} already present'.format(temp_text_folder))

    # NOTE(review): the removal of existing sub-folders is disabled, so the
    # two blocks below only log and leave any existing file in place.
    if os.path.isdir(table_subfolder):
        logger.info('Clearing table temp folder from existing files...')
        logger.info('Clear done')
    if os.path.isdir(text_subfolder):
        logger.info('Clearing text temp folder from existing files...')
        logger.info('Clear done')

    try:
        logger.info('Creating {}...'.format(temp_table_folder))
        os.makedirs(table_subfolder)
        logger.info(temp_table_folder + ' created')
    except OSError as exc:  # Guard against race condition
        if exc.errno != errno.EEXIST:
            raise OutputError('{} was not created.'.format(temp_table_folder)) from exc
        logger.info('{} already present'.format(temp_table_folder))

    try:
        logger.info('Creating {}...'.format(temp_text_folder))
        os.makedirs(text_subfolder)
        logger.info(temp_text_folder + ' created')
    except OSError as exc:  # Guard against race condition
        if exc.errno != errno.EEXIST:
            raise OutputError('{} was not created'.format(temp_text_folder)) from exc
        logger.info('{} already present'.format(temp_text_folder))