Python convert_from_path Examples, pdf2image.pdf2image.convert_from_path Python Examples

Example #1

0

Show file

File: utils.py Project: vikaschouhan/portfolio_analysis

def save_multiple_figs_to_image_file(fig_list, out_image, width=16, height=9, close_figs=True):
    """Write several figures into one image file, stacked vertically.

    The figures are first written to a temporary PDF via ``save_figs_to_pdf``,
    the PDF is rasterised with ``pdf2image``, and the resulting page images are
    handed to ``join_images_vertical`` together with ``out_image``.

    :param fig_list: figures to render (passed through to ``save_figs_to_pdf``).
    :param out_image: destination passed to ``join_images_vertical``.
    :param width: forwarded to ``save_figs_to_pdf``.
    :param height: forwarded to ``save_figs_to_pdf``.
    :param close_figs: forwarded to ``save_figs_to_pdf``; presumably controls
        whether the figures are closed after being written - confirm against
        that helper.
    """
    tmp_pdf = '/tmp/______{}_tmp_pdf'.format(u4())
    try:
        # Honour the caller's close_figs choice (it was previously hard-coded
        # to True, so close_figs=False had no effect).
        save_figs_to_pdf(tmp_pdf, fig_list, width=width, height=height, close_figs=close_figs)
        # Convert pdf to images
        image_list = [np.asarray(x) for x in pdf2image.convert_from_path(tmp_pdf)]
    finally:
        # Always drop the intermediate PDF, even when rendering fails; the
        # existence check avoids masking the original error with a
        # FileNotFoundError if the PDF was never written.
        if os.path.exists(tmp_pdf):
            os.remove(tmp_pdf)
    # Join images & save file
    join_images_vertical(image_list, out_image)

Example #2

0

Show file

    def extract_text(self):
        """OCR every page of ``self.filename`` and collect its sentences.

        Steps:
          1. Render the PDF to JPEG files (200 dpi) under
             ``<self.image_out_path>/<basename of self.filename>/``, ten pages
             per ``convert_from_path`` call so only one batch of page images
             is held in memory at a time.
          2. Run ``pytesseract.image_to_string`` on each JPEG in page order.
          3. Join the page texts, collapse runs of newlines, split the result
             into sentences with ``TextBlob`` and append each sentence (with
             newlines replaced by spaces) to ``self.english``.
          4. Delete ``self.image_out_path`` recursively.

        Returns nothing; the result is left in ``self.english``.
        """
        PDF_file = self.filename
        out_folder_name = os.path.basename(self.filename)
        page_image_dir = os.path.abspath(
            os.path.join(self.image_out_path, out_folder_name))

        # makedirs creates self.image_out_path and the per-document folder in
        # one step and tolerates either of them already existing.
        os.makedirs(page_image_dir, exist_ok=True)

        index = 0
        batch_size = 10
        # NOTE(review): _page_count is a private pdf2image helper; newer
        # releases expose pdfinfo_from_path instead - confirm against the
        # installed version.
        maxPages = pdf2image._page_count(PDF_file)
        # pdf2image page numbers are 1-based and last_page is inclusive, so
        # batches are 1-10, 11-20, ...  Starting the range at 0 (as before)
        # skipped the final page whenever the page count was a multiple of 10.
        for first_page in range(1, maxPages + 1, batch_size):
            pages = pdf2image.convert_from_path(
                PDF_file,
                dpi=200,
                first_page=first_page,
                last_page=min(first_page + batch_size - 1, maxPages))
            for tpage in pages:
                tpage.save(
                    os.path.join(page_image_dir, str(index) + ".jpg"), 'JPEG')
                index = index + 1

        print("Successfully saved images for each page for {}".format(
            self.image_out_path))

        english_text = list()

        # Filter before sorting so that stray non-jpg files in the folder do
        # not reach the int() sort key.
        jpg_names = [name for name in os.listdir(page_image_dir)
                     if name.endswith("jpg")]
        for filename in sorted(jpg_names,
                               key=lambda x: int(os.path.splitext(x)[0])):
            # The context manager releases the file handle before the folder
            # is removed at the end of this method.
            with Image.open(os.path.join(page_image_dir,
                                         filename)) as page_image:
                text = str(pytesseract.image_to_string(page_image))
            # Re-join words that were hyphenated across a line break.
            text = text.replace('-\n', '')
            english_text.append(text)

        corpus = " ".join(english_text)
        corpus = re.sub(r'\n+', '\n', corpus).strip()
        corpus = TextBlob(corpus)
        for sentence in corpus.sentences:
            self.english.append(sentence.string.replace("\n", " "))
        print("English Text Extracted is : {}".format(self.english))
        shutil.rmtree(self.image_out_path)

Example #3

0

Show file

File: Pdf.py Project: ttyhu/Flask-OCR

def convert_to_jpg(input_pdf_file,
                   target_dir,
                   num_page=0,
                   fname_fmt="{num_page}.jpg"):
    """Render one page of a PDF and save it as an image file.

    :param input_pdf_file: path of the PDF to read.
    :param target_dir: directory that receives the image; created if missing.
    :param num_page: zero-based index of the page to convert.
    :param fname_fmt: file-name template; ``{num_page}`` is replaced by
        ``num_page``.
    :raises IndexError: if the PDF has no page at index ``num_page``.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(target_dir, exist_ok=True)

    # pdf2image pages are 1-based and last_page is inclusive; only one page is
    # saved, so request exactly that page instead of rendering a second one.
    page_number = num_page + 1
    images = pdf2image.convert_from_path(input_pdf_file,
                                         first_page=page_number,
                                         last_page=page_number)
    if not images:
        raise IndexError("page index {} is out of range for {}".format(
            num_page, input_pdf_file))

    path_file = os.path.join(target_dir, fname_fmt.format(num_page=num_page))
    print('save : ' + path_file)
    images[0].save(path_file)

Example #4

0

Show file

File: core.py Project: erikkastelec/PDFScraper

def pdf_to_image(document: Document):
    """Write JPEG image(s) for ``document`` into ``<tempdir>/PDFScraper``.

    A PDF (``document.is_pdf`` truthy) is rendered at 300 dpi and every page is
    saved as ``<filename>_<page index>.jpg`` (index starting at 0). Anything
    else is read with ``cv2.imread`` and re-written as ``<filename>_0.jpg``.
    """
    out_dir = tempfile.gettempdir() + "/PDFScraper"
    try:
        os.makedirs(out_dir)
    except FileExistsError:
        # Directory is already there from an earlier run.
        pass

    if not document.is_pdf:
        loaded = cv2.imread(document.path)
        cv2.imwrite(out_dir + "/" + document.filename + "_0.jpg", loaded)
        return

    rendered_pages = pdf2image.convert_from_path(pdf_path=document.path, dpi=300)
    # TODO: implement saving to temp dir with mkstemp for better security
    for page_index, page_image in enumerate(rendered_pages):
        target = out_dir + "/" + document.filename + "_" + str(page_index) + ".jpg"
        page_image.save(target)

Example #5

0

Show file

File: Screen_PDFToImage.py Project: mrivem/PDFtools

    def convert_pdf(self, path):
        """Convert the PDF at ``path`` into one image file per page.

        Images are written to ``<self.save_path>/<pdf name without extension>/``
        as ``<name>_<page>.<self.option_output_format>`` with pages numbered
        from 1. Status text and progress values are emitted through
        ``self.signal_message`` and ``self.signal_progress`` along the way, and
        every file in the ``_TEMP`` folder is deleted afterwards.
        """
        # Name of the PDF without directory or extension
        pdf_stem = os.path.splitext(os.path.basename(path))[0]
        out_dir = os.path.join(self.save_path, pdf_stem)

        # Make sure the per-PDF output folder is there
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)

        # Report the starting state
        self.signal_progress.emit(self.progress)
        self.signal_message.emit(
            f"Now reading pages from {path}.. this might take a while")

        # Render at 72 dpi; pdf2image is given _TEMP as its output folder
        images = pdf2image.convert_from_path(path,
                                             dpi=72,
                                             output_folder="_TEMP")
        page_total = len(images)

        for counter, img in enumerate(images, start=1):
            # Report which page is being written
            self.signal_message.emit(
                f"From {path}: Saving page {counter} of {page_total}")
            # This file's share of the overall progress, scaled by the number
            # of entries in self.paths
            file_share = int((counter / page_total * 100) / len(self.paths))
            self.signal_progress.emit(self.progress + file_share)

            # Write the page, then release it
            out_name = f"{pdf_stem}_{counter}.{self.option_output_format}"
            img.save(os.path.join(out_dir, out_name),
                     self.option_output_format)
            img.close()

        self.signal_message.emit("Clearing the _TEMP folder...")
        for leftover in os.listdir("_TEMP"):
            os.remove(os.path.join("_TEMP", leftover))
        self.signal_message[str, int].emit(
            f"{path} processed successfully into {page_total} images.", 2000)

Example #6

0

Show file

File: Pdf.py Project: rcln/pamparios-

def convert_to_jpg(input_pdf_file,
                   target_dir,
                   num_page=0,
                   fname_fmt="{num_page}.jpg"):
    """
    Convert one page of a pdf to an image file
    :param input_pdf_file: pdf path
    :param target_dir: the destination directory of the image; created if missing
    :param num_page: zero-based index of the page to convert
    :param fname_fmt: the destination file name template; ``{num_page}`` is
        replaced by ``num_page``
    :raises IndexError: if the pdf has no page at index ``num_page``
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(target_dir, exist_ok=True)

    # pdf2image pages are 1-based and last_page is inclusive; only one page is
    # saved, so request exactly that page instead of rendering a second one.
    page_number = num_page + 1
    images = pdf2image.convert_from_path(input_pdf_file,
                                         first_page=page_number,
                                         last_page=page_number)
    if not images:
        raise IndexError("page index {} is out of range for {}".format(
            num_page, input_pdf_file))

    path_file = os.path.join(target_dir, fname_fmt.format(num_page=num_page))
    images[0].save(path_file)

Example #7

0

Show file

File: api.py Project: MichaelMueller/ReportGenerator

def generate_report(dcm_sr_path, config_file, log_level, log_file):
    """Turn a DICOM SR file into a report, stopping at the configured target.

    The pipeline runs these stages in order and exits the process with
    ``sys.exit(0)`` as soon as the stage named by ``config.target`` is done:

    1. ``"xml"``      - convert the SR to XML with the ``dsr2xml`` command.
    2. ``"template"`` - evaluate the configured XPath rules on the XML and
       fill the template (``.docx`` or text file) with the results.
    3. ``"pdf"``      - convert the filled template to PDF.
    4. ``"dcm_pdf"``  - wrap the PDF in a DICOM file with ``pdf2dcm``;
       any other target rasterises the PDF to JPEG pages and converts each
       one with ``img2dcm``.

    If ``config.dcm_send_ip`` is set, the resulting DICOM files (plus the
    original SR when ``config.dcm_send_dcm_sr`` is set) are sent with
    ``dcmsend``.

    :param dcm_sr_path: path of the DICOM SR input file.
    :param config_file: path of the JSON configuration file.
    :param log_level: passed to ``setup_logging``.
    :param log_file: passed to ``setup_logging``.

    Any ``Exception`` raised inside the pipeline is logged and swallowed.
    ``quit()`` / ``sys.exit()`` raise ``SystemExit``, which is not an
    ``Exception`` subclass and therefore propagates to the caller.
    """
    # logging
    setup_logging(log_level, log_file)
    logger = logging.getLogger(__name__)

    # NOTE(review): the temporary directory created below is never cleaned up
    # explicitly; removal is left to TemporaryDirectory's finalizer.
    try:
        # LOAD CONFIG AND SETUP
        with open(config_file) as json_file:
            data = json.load(json_file)
            config = Config.create_from_dict(data)
            error_str = config.validate()
            if error_str:
                # A non-empty validation message aborts the process.
                quit(error_str)
        config.add_paths()
        # A temporary directory is always created; it is the fallback for both
        # temp_dir and output_dir when the config does not set them.
        temp_dir_object = tempfile.TemporaryDirectory()
        temp_dir = temp_dir_object.name if config.temp_dir is None else config.temp_dir
        # Input file name without directory and extension; used as the stem of
        # every intermediate file.
        dcm_sr_filename = os.path.basename(os.path.splitext(dcm_sr_path)[0])
        output_file_name = config.output_file_name if config.output_file_name is not None else dcm_sr_filename
        # NOTE(review): when config.output_dir is None the outputs land in the
        # temporary directory - confirm this is intended.
        output_dir = temp_dir_object.name if config.output_dir is None else config.output_dir
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # GENERATE XML FILE
        sr_xml_file = os.path.join(temp_dir, dcm_sr_filename + ".xml")
        logger.info("converting DICOM SR {} to XML file {}".format(
            dcm_sr_path, sr_xml_file))
        run_cmd("dsr2xml", *config.dsr2xml_exe_additional_options, dcm_sr_path,
                sr_xml_file)
        if config.target == "xml":
            sr_xml_file_output = os.path.join(output_dir,
                                              output_file_name + ".xml")
            shutil.move(sr_xml_file, sr_xml_file_output)
            logger.info("xml created in {}. quit requested.".format(
                sr_xml_file_output))
            sys.exit(0)

        # GENERATE TEMPLATE DATA: EXTRACT CONTENTS FROM XML USING XPATH
        logger.info("retrieving contents from XML file {}".format(sr_xml_file))
        # NOTE(review): .xpath() is used below, so ET is presumably lxml.etree
        # rather than the stdlib ElementTree - confirm against the imports.
        root = ET.parse(sr_xml_file)
        template_data = {}
        for rule in config.rules:
            # Accumulates the text of all xpath expressions of this rule.
            text = ""
            for rule_idx, xpath_expression in enumerate(
                    rule.xpath_expressions):
                xpath_result = root.xpath(xpath_expression)
                # A list result is flattened into one string.
                if isinstance(xpath_result, List):
                    xpath_result = rule.concat_string.join(xpath_result)

                if not isinstance(xpath_result, str):
                    quit(
                        "xpath did not produce text: \"{}\" in rule {}, index {}"
                        .format(xpath_expression, rule.name, str(rule_idx)))
                elif len(xpath_result) == 0:
                    logger.warning(
                        "empty text for xpath \"{}\" in rule {}, index {}".
                        format(xpath_expression, rule.name, str(rule_idx)))

                else:
                    logger.info("result for xpath {}: {}".format(
                        xpath_expression, xpath_result))
                    # Separate this result from earlier ones of the same rule.
                    if text:
                        text = text + rule.concat_string
                    text = text + xpath_result
                    # Replacements run on the whole accumulated text, so text
                    # from earlier expressions is processed again each time.
                    for search, replace in rule.replacements.items():
                        text = text.replace(search, replace)

            template_data[rule.name] = text
        logger.debug("template_data: {}".format(str(template_data)))

        # GENERATE FILLED TEMPLATE: LOAD TEMPLATE AND SET CONTENTS ON NAMED PLACEHOLDERS
        _, template_file_extension = os.path.splitext(config.template_path)
        # Only an exact ".docx" extension selects the Word code path.
        template_is_word = template_file_extension == ".docx"
        filled_template_file = os.path.join(
            temp_dir, dcm_sr_filename + template_file_extension)
        logger.info(
            "replacing contents from template docx file {} into {}".format(
                config.template_path, filled_template_file))
        if template_is_word:
            replace_in_docx(config.template_path, template_data,
                            filled_template_file)
        else:
            replace_in_text_file(config.template_path, template_data,
                                 filled_template_file)
        if config.target == "template":
            filled_template_file_output = os.path.join(
                output_dir, output_file_name + template_file_extension)
            shutil.move(filled_template_file, filled_template_file_output)
            logger.info("template created in {}. quit requested.".format(
                filled_template_file_output))
            sys.exit(0)

        # CONVERT TO PDF
        pdf_tmp_file = os.path.join(temp_dir, dcm_sr_filename + ".pdf")
        logger.info("converting file {} into pdf file {}".format(
            filled_template_file, pdf_tmp_file))
        with suppress_stdout():
            if template_is_word:
                doc2pdf(filled_template_file, pdf_tmp_file)
            else:
                # Non-Word templates go through pdfkit / wkhtmltopdf; the
                # executable name gets an ".exe" suffix on Windows.
                wkhtmltopdf = "wkhtmltopdf"
                if sys.platform == 'win32':
                    wkhtmltopdf += ".exe"
                pdfkit.from_file(filled_template_file,
                                 pdf_tmp_file,
                                 configuration=pdfkit.pdfkit.Configuration(
                                     wkhtmltopdf=wkhtmltopdf))
        if config.target == "pdf":
            pdf_output_file_path = os.path.join(output_dir,
                                                output_file_name + ".pdf")
            shutil.move(pdf_tmp_file, pdf_output_file_path)
            logger.info("pdf file created in {}. quit requested.".format(
                pdf_output_file_path))
            sys.exit(0)

        # CONVERT TO DICOM
        # Collects every DICOM file produced below for the optional send step.
        dcm_files = []
        # GENERATE DICOM PDF
        if config.target == "dcm_pdf":
            # CONVERT TO DICOM PDF
            dcm_pdf_tmp_file = os.path.join(output_dir,
                                            output_file_name + ".pdf.dcm")
            # The UID is built from the configured OID root and the SHA-256 of
            # the source SR file.
            sop_instance_uid = generate_dcm_uid(config.oid_root,
                                                sha256sum(dcm_sr_path))
            logger.info("converting file {} into DICOM pdf file {}".format(
                pdf_tmp_file, dcm_pdf_tmp_file))
            run_cmd("pdf2dcm", pdf_tmp_file, dcm_pdf_tmp_file, "--series-from",
                    dcm_sr_path, *config.pdf2dcm_exe_additional_options,
                    "--key", "0008,0018={}".format(sop_instance_uid))
            dcm_files.append(dcm_pdf_tmp_file)
        # GENERATE DICOM IMAGE STUDY (DEFAULT TARGET)
        else:
            # paths_only=True makes pdf2image return the paths of the JPEG
            # files it writes into temp_dir instead of loaded images.
            images = pdf2image.convert_from_path(pdf_tmp_file,
                                                 paths_only=True,
                                                 output_folder=temp_dir,
                                                 fmt="jpg")
            for idx, image in enumerate(images):
                # One DICOM file per page image, numbered from 1
                dcm_file = os.path.join(
                    output_dir,
                    output_file_name + "_image" + str(idx + 1) + ".dcm")
                logger.info("converting image {} into DICOM file {}".format(
                    image, dcm_file))
                # Here the UID is derived from the page image, not the SR.
                sop_instance_uid = generate_dcm_uid(config.oid_root,
                                                    sha256sum(image))

                # NOTE(review): the key 0020,0013 is passed twice with the same
                # value; one of them may have been meant to be a different
                # tag - confirm.
                run_cmd("img2dcm",
                        "--series-from",
                        dcm_sr_path,
                        *config.img2dcm_exe_additional_options,
                        image,
                        dcm_file,
                        "--key",
                        "0008,0060=OT",
                        "--key",
                        "0020,0013={}".format(idx + 1),
                        "--key",
                        "0020,0013={}".format(idx + 1),
                        "--key",
                        "0008,0018={}".format(sop_instance_uid),
                        print_stdout=True)
                dcm_files.append(dcm_file)

        # SEND TO DICOM NODE
        if len(dcm_files) > 0 and config.dcm_send_ip:
            # Optionally send the original SR along with the generated files.
            if config.dcm_send_dcm_sr:
                dcm_files.append(dcm_sr_path)
            for dcm_file in dcm_files:
                logger.info("sending file {} to dicom node".format(dcm_file))
                # run_cmd("dcmsend", "localhost", "2727", dcm_sr_path)
                run_cmd("dcmsend",
                        config.dcm_send_ip,
                        str(config.dcm_send_port),
                        dcm_file,
                        *config.dcmsend_exe_additional_options,
                        print_stdout=False)

    except Exception as error:
        # Log the traceback and return normally; the error is not re-raised.
        logger.exception(error)