def save_multiple_figs_to_image_file(fig_list, out_image, width=16, height=9, close_figs=True):
    """Render several figures into one vertically stacked image file.

    The figures are written to a temporary multi-page PDF, every page is
    rasterised with pdf2image, and the resulting page images are joined from
    top to bottom by ``join_images_vertical``.

    :param fig_list: figures to render; forwarded to ``save_figs_to_pdf``
    :param out_image: destination forwarded to ``join_images_vertical``
    :param width: page width forwarded to ``save_figs_to_pdf``
    :param height: page height forwarded to ``save_figs_to_pdf``
    :param close_figs: forwarded to ``save_figs_to_pdf`` (previously this
        argument was ignored and ``True`` was always passed)
    """
    tmp_pdf = '/tmp/______{}_tmp_pdf'.format(u4())
    try:
        save_figs_to_pdf(tmp_pdf, fig_list, width=width, height=height,
                         close_figs=close_figs)

        # Convert pdf to images
        image_list = [np.asarray(page)
                      for page in pdf2image.convert_from_path(tmp_pdf)]
    finally:
        # Remove the intermediate PDF even when rendering or conversion
        # fails; it may not exist if save_figs_to_pdf raised early.
        if os.path.exists(tmp_pdf):
            os.remove(tmp_pdf)

    # Join images & save file
    join_images_vertical(image_list, out_image)
def extract_text(self):
    """OCR every page of ``self.filename`` and append sentences to ``self.english``.

    Each page of the PDF is rendered to ``<image_out_path>/<pdf name>/<n>.jpg``
    (in batches of 10 pages), every image is passed through pytesseract, the
    combined text is split into sentences with TextBlob, and finally the whole
    ``self.image_out_path`` directory is removed.

    Reads ``self.filename`` and ``self.image_out_path``; appends to
    ``self.english``.
    """
    pdf_file = self.filename
    page_dir = os.path.abspath(
        os.path.join(self.image_out_path, os.path.basename(self.filename)))
    # Creates image_out_path and the per-document sub-folder in one call.
    os.makedirs(page_dir, exist_ok=True)

    batch_size = 10
    index = 0
    # NOTE(review): _page_count is a private pdf2image helper; confirm it
    # exists in the installed pdf2image version.
    max_pages = pdf2image._page_count(pdf_file)
    # pdf2image page numbers start at 1. Starting the batches at 1 (instead
    # of 0) keeps every batch exactly `batch_size` pages wide, so the final
    # page is no longer dropped when the page count is a multiple of 10.
    for first_page in range(1, max_pages + 1, batch_size):
        pages = pdf2image.convert_from_path(
            pdf_file,
            dpi=200,
            first_page=first_page,
            last_page=min(first_page + batch_size - 1, max_pages))
        for tpage in pages:
            tpage.save(os.path.join(page_dir, str(index) + ".jpg"), 'JPEG')
            index += 1
    print("Successfully saved images for each page for {}".format(
        self.image_out_path))

    # Only consider the jpg files, so an unrelated file in the folder cannot
    # break the numeric sort key.
    jpg_names = [name for name in os.listdir(page_dir)
                 if name.endswith("jpg")]
    jpg_names.sort(key=lambda name: int(os.path.splitext(name)[0]))

    english_text = []
    for name in jpg_names:
        # The context manager closes the image file once OCR is done.
        with Image.open(os.path.join(page_dir, name)) as page_image:
            text = str(pytesseract.image_to_string(page_image))
        # Re-join words that were hyphenated across a line break.
        english_text.append(text.replace('-\n', ''))

    corpus = " ".join(english_text)
    corpus = re.sub(r'\n+', '\n', corpus).strip()
    corpus = TextBlob(corpus)
    for sentence in corpus.sentences:
        self.english.append(sentence.string.replace("\n", " "))
    print("English Text Extracted is : {}".format(self.english))
    shutil.rmtree(self.image_out_path)
def convert_to_jpg(input_pdf_file, target_dir, num_page=0, fname_fmt="{num_page}.jpg"):
    """Convert a single page of a PDF to an image file.

    :param input_pdf_file: path of the PDF to read
    :param target_dir: destination folder (created when missing)
    :param num_page: zero-based index of the page to convert
    :param fname_fmt: file-name template; ``{num_page}`` is replaced by
        ``num_page``
    :raises IndexError: if pdf2image returns no image for the requested page
    """
    os.makedirs(target_dir, exist_ok=True)

    # pdf2image counts pages from 1. Request exactly one page; the previous
    # last_page=num_page + 2 rendered a second page that was never used.
    page_number = num_page + 1
    images = pdf2image.convert_from_path(input_pdf_file,
                                         first_page=page_number,
                                         last_page=page_number)
    if not images:
        raise IndexError(
            "no page {} in {}".format(num_page, input_pdf_file))

    path_file = os.path.join(target_dir, fname_fmt.format(num_page=num_page))
    print('save : ' + path_file)
    images[0].save(path_file)
def pdf_to_image(document: Document):
    """Write ``document`` as JPEG file(s) into ``<system temp dir>/PDFScraper``.

    For a PDF (``document.is_pdf`` is truthy) every page is rendered at
    300 dpi and saved as ``<filename>_<page index>.jpg``, with the index
    starting at 0. Any other document is read with ``cv2.imread`` and
    written as ``<filename>_0.jpg``.

    :param document: object providing ``is_pdf``, ``path`` and ``filename``
    """
    tempfile_path = tempfile.gettempdir() + "/PDFScraper"
    os.makedirs(tempfile_path, exist_ok=True)

    if document.is_pdf:
        pages = pdf2image.convert_from_path(pdf_path=document.path, dpi=300)
        # TODO: implement saving to temp dir with mkstemp for better security
        for i, page in enumerate(pages):
            page.save(tempfile_path + "/" + document.filename + "_" + str(i) + ".jpg")
    else:
        img = cv2.imread(document.path)
        cv2.imwrite(tempfile_path + "/" + document.filename + "_0.jpg", img)
def convert_pdf(self, path):
    """Convert every page of the PDF at ``path`` into an image file.

    Pages are rendered at 72 dpi into the ``_TEMP`` folder and then saved as
    ``<save_path>/<pdf name>/<pdf name>_<page>.<format>`` using
    ``self.option_output_format``. Progress and status messages are emitted
    through ``self.signal_progress`` and ``self.signal_message``.
    The ``_TEMP`` folder is emptied afterwards, also when a page fails to save.

    :param path: path of the PDF file to convert
    """
    temp_folder = "_TEMP"

    # This file paths
    this_file_prefix = os.path.splitext(os.path.split(path)[1])[0]
    this_save_path = os.path.join(self.save_path, this_file_prefix)

    # Create the output and scratch folders if they don't exist
    os.makedirs(this_save_path, exist_ok=True)
    os.makedirs(temp_folder, exist_ok=True)

    # Signal info to the main thread
    self.signal_progress.emit(self.progress)
    self.signal_message.emit(
        f"Now reading pages from {path}.. this might take a while")

    images = []
    try:
        # Convert the pdf into a list of images (stored in the _TEMP folder)
        images = pdf2image.convert_from_path(path,
                                             dpi=72,
                                             output_folder=temp_folder)
        total_pages = len(images)

        # Save each image
        for counter, img in enumerate(images, start=1):
            # Signal info to the main thread
            self.signal_message.emit(
                f"From {path}: Saving page {counter} of {total_pages}")
            # Share of the overall progress bar that belongs to this file.
            this_progress = int(
                (counter / total_pages * 100) / len(self.paths))
            self.signal_progress.emit(self.progress + this_progress)

            # Save the image
            img.save(
                os.path.join(
                    this_save_path,
                    f"{this_file_prefix}_{counter}.{self.option_output_format}"
                ), self.option_output_format)
            img.close()
    finally:
        # Release any image still open after a failure so its backing file
        # can be deleted; closing an already closed image should be harmless.
        for img in images:
            img.close()
        self.signal_message.emit("Clearing the _TEMP folder...")
        for temp_file in os.listdir(temp_folder):
            os.remove(os.path.join(temp_folder, temp_file))

    self.signal_message[str, int].emit(
        f"{path} processed successfully into {len(images)} images.", 2000)
def convert_to_jpg(input_pdf_file, target_dir, num_page=0, fname_fmt="{num_page}.jpg"):
    """
    Convert a page of a pdf in jpg

    :param input_pdf_file: pdf path
    :param target_dir: the dest of images (created when missing)
    :param num_page: the zero-based page number to convert
    :param fname_fmt: the dest name; ``{num_page}`` is replaced by ``num_page``
    :raises IndexError: if pdf2image returns no image for the requested page
    """
    os.makedirs(target_dir, exist_ok=True)

    # pdf2image counts pages from 1. Request exactly one page; the previous
    # last_page=num_page + 2 rendered a second page that was never used.
    page_number = num_page + 1
    images = pdf2image.convert_from_path(input_pdf_file,
                                         first_page=page_number,
                                         last_page=page_number)
    if not images:
        raise IndexError(
            "no page {} in {}".format(num_page, input_pdf_file))

    path_file = os.path.join(target_dir, fname_fmt.format(num_page=num_page))
    images[0].save(path_file)
def generate_report(dcm_sr_path, config_file, log_level, log_file):
    """Turn a DICOM SR file into a report, stopping at the configured target.

    The pipeline runs these stages in order and exits the process early
    (``sys.exit(0)``) once the stage named by ``config.target`` is reached:

    1. ``"xml"``      - convert the SR to XML with ``dsr2xml``.
    2. ``"template"`` - evaluate the configured xpath rules on the XML and
       fill the template (docx or text).
    3. ``"pdf"``      - convert the filled template to PDF.
    4. ``"dcm_pdf"``  - wrap the PDF in a DICOM file with ``pdf2dcm``;
       any other target converts each PDF page to a DICOM image with
       ``img2dcm``.

    The resulting DICOM files are sent with ``dcmsend`` when
    ``config.dcm_send_ip`` is set. Errors derived from ``Exception`` are
    logged and not re-raised.

    :param dcm_sr_path: path of the DICOM SR input file
    :param config_file: path of the JSON configuration file
    :param log_level: forwarded to ``setup_logging``
    :param log_file: forwarded to ``setup_logging``
    :raises SystemExit: on invalid configuration, on an xpath that does not
        produce text, or when an early target has been produced
    """
    # logging
    setup_logging(log_level, log_file)
    logger = logging.getLogger(__name__)

    temp_dir_object = None
    try:
        # LOAD CONFIG AND SETUP
        with open(config_file) as json_file:
            data = json.load(json_file)
        config = Config.create_from_dict(data)
        error_str = config.validate()
        if error_str:
            sys.exit(error_str)
        config.add_paths()

        temp_dir_object = tempfile.TemporaryDirectory()
        temp_dir = (temp_dir_object.name
                    if config.temp_dir is None else config.temp_dir)
        dcm_sr_filename = os.path.basename(os.path.splitext(dcm_sr_path)[0])
        output_file_name = (config.output_file_name
                            if config.output_file_name is not None else
                            dcm_sr_filename)
        # NOTE(review): without config.output_dir the results are written to
        # the temporary directory, which is removed when this function ends.
        output_dir = (temp_dir_object.name
                      if config.output_dir is None else config.output_dir)
        os.makedirs(output_dir, exist_ok=True)

        # GENERATE XML FILE
        sr_xml_file = os.path.join(temp_dir, dcm_sr_filename + ".xml")
        logger.info("converting DICOM SR {} to XML file {}".format(
            dcm_sr_path, sr_xml_file))
        run_cmd("dsr2xml", *config.dsr2xml_exe_additional_options,
                dcm_sr_path, sr_xml_file)
        if config.target == "xml":
            sr_xml_file_output = os.path.join(output_dir,
                                              output_file_name + ".xml")
            shutil.move(sr_xml_file, sr_xml_file_output)
            logger.info("xml created in {}. quit requested.".format(
                sr_xml_file_output))
            sys.exit(0)

        # GENERATE TEMPLATE DATA: EXTRACT CONTENTS FROM XML USING XPATH
        logger.info("retrieving contents from XML file {}".format(sr_xml_file))
        root = ET.parse(sr_xml_file)
        template_data = {}
        for rule in config.rules:
            text = ""
            for rule_idx, xpath_expression in enumerate(
                    rule.xpath_expressions):
                xpath_result = root.xpath(xpath_expression)
                if isinstance(xpath_result, List):
                    xpath_result = rule.concat_string.join(xpath_result)
                if not isinstance(xpath_result, str):
                    sys.exit(
                        "xpath did not produce text: \"{}\" in rule {}, index {}"
                        .format(xpath_expression, rule.name, str(rule_idx)))
                elif len(xpath_result) == 0:
                    # Empty results are reported and skipped, so no dangling
                    # separator is added to the rule text.
                    logger.warning(
                        "empty text for xpath \"{}\" in rule {}, index {}".
                        format(xpath_expression, rule.name, str(rule_idx)))
                else:
                    logger.info("result for xpath {}: {}".format(
                        xpath_expression, xpath_result))
                    if text:
                        text = text + rule.concat_string
                    text = text + xpath_result
            for search, replace in rule.replacements.items():
                text = text.replace(search, replace)
            template_data[rule.name] = text
        logger.debug("template_data: {}".format(str(template_data)))

        # GENERATE FILLED TEMPLATE: LOAD TEMPLATE AND SET CONTENTS ON NAMED PLACEHOLDERS
        _, template_file_extension = os.path.splitext(config.template_path)
        template_is_word = template_file_extension == ".docx"
        filled_template_file = os.path.join(
            temp_dir, dcm_sr_filename + template_file_extension)
        logger.info(
            "replacing contents from template docx file {} into {}".format(
                config.template_path, filled_template_file))
        if template_is_word:
            replace_in_docx(config.template_path, template_data,
                            filled_template_file)
        else:
            replace_in_text_file(config.template_path, template_data,
                                 filled_template_file)
        if config.target == "template":
            filled_template_file_output = os.path.join(
                output_dir, output_file_name + template_file_extension)
            shutil.move(filled_template_file, filled_template_file_output)
            logger.info("template created in {}. quit requested.".format(
                filled_template_file_output))
            sys.exit(0)

        # CONVERT TO PDF
        pdf_tmp_file = os.path.join(temp_dir, dcm_sr_filename + ".pdf")
        logger.info("converting file {} into pdf file {}".format(
            filled_template_file, pdf_tmp_file))
        with suppress_stdout():
            if template_is_word:
                doc2pdf(filled_template_file, pdf_tmp_file)
            else:
                wkhtmltopdf = "wkhtmltopdf"
                if sys.platform == 'win32':
                    wkhtmltopdf += ".exe"
                pdfkit.from_file(filled_template_file,
                                 pdf_tmp_file,
                                 configuration=pdfkit.pdfkit.Configuration(
                                     wkhtmltopdf=wkhtmltopdf))
        if config.target == "pdf":
            pdf_output_file_path = os.path.join(output_dir,
                                                output_file_name + ".pdf")
            shutil.move(pdf_tmp_file, pdf_output_file_path)
            logger.info("pdf file created in {}. quit requested.".format(
                pdf_output_file_path))
            sys.exit(0)

        # CONVERT TO DICOM
        dcm_files = []
        if config.target == "dcm_pdf":
            # GENERATE DICOM PDF
            dcm_pdf_tmp_file = os.path.join(output_dir,
                                            output_file_name + ".pdf.dcm")
            sop_instance_uid = generate_dcm_uid(config.oid_root,
                                                sha256sum(dcm_sr_path))
            logger.info("converting file {} into DICOM pdf file {}".format(
                pdf_tmp_file, dcm_pdf_tmp_file))
            run_cmd("pdf2dcm", pdf_tmp_file, dcm_pdf_tmp_file,
                    "--series-from", dcm_sr_path,
                    *config.pdf2dcm_exe_additional_options, "--key",
                    "0008,0018={}".format(sop_instance_uid))
            dcm_files.append(dcm_pdf_tmp_file)
        else:
            # GENERATE DICOM IMAGE STUDY (DEFAULT TARGET)
            images = pdf2image.convert_from_path(pdf_tmp_file,
                                                 paths_only=True,
                                                 output_folder=temp_dir,
                                                 fmt="jpg")
            for idx, image in enumerate(images):
                dcm_file = os.path.join(
                    output_dir,
                    output_file_name + "_image" + str(idx + 1) + ".dcm")
                logger.info("converting image {} into DICOM file {}".format(
                    image, dcm_file))
                sop_instance_uid = generate_dcm_uid(config.oid_root,
                                                    sha256sum(image))
                # The 0020,0013 key is passed once; it was previously
                # repeated with the same value.
                run_cmd("img2dcm",
                        "--series-from",
                        dcm_sr_path,
                        *config.img2dcm_exe_additional_options,
                        image,
                        dcm_file,
                        "--key",
                        "0008,0060=OT",
                        "--key",
                        "0020,0013={}".format(idx + 1),
                        "--key",
                        "0008,0018={}".format(sop_instance_uid),
                        print_stdout=True)
                dcm_files.append(dcm_file)

        # SEND TO DICOM NODE
        if dcm_files and config.dcm_send_ip:
            if config.dcm_send_dcm_sr:
                dcm_files.append(dcm_sr_path)
            for dcm_file in dcm_files:
                logger.info("sending file {} to dicom node".format(dcm_file))
                run_cmd("dcmsend",
                        config.dcm_send_ip,
                        str(config.dcm_send_port),
                        dcm_file,
                        *config.dcmsend_exe_additional_options,
                        print_stdout=False)
    except Exception as error:
        logger.exception(error)
    finally:
        # Remove the scratch directory explicitly instead of relying on the
        # TemporaryDirectory finalizer.
        if temp_dir_object is not None:
            temp_dir_object.cleanup()