def _split_pdf(self, input_folder: dataiku.Folder, output_folder: dataiku.Folder, input_path: AnyStr) -> List[AnyStr]:
    """Write each page of a PDF file as its own single-page PDF file in another folder

    Args:
        input_folder: `dataiku.Folder` where the input PDF file is stored
        output_folder: `dataiku.Folder` where the single-page files will be saved
        input_path: path of the input PDF file in the `input_folder`

    Returns:
        List of paths generated in the `output_folder`, in page order.
        Each path is "<input directory>/<input file name without extension>_page_<N>.pdf", N starting at 1

    """
    # Read the whole file in memory so the PDF reader does not depend on the download stream
    with input_folder.get_download_stream(input_path) as stream:
        source_pdf = PdfFileReader(BytesIO(stream.read()))
    parent_directory = os.path.dirname(input_path)
    file_name = os.path.basename(input_path)
    file_stem = os.path.splitext(file_name)[0]
    generated_paths = []
    for page_number in range(1, source_pdf.getNumPages() + 1):
        # Page numbers in file names are 1-based whereas PDF page indices are 0-based
        single_page_writer = PdfFileWriter()
        single_page_writer.addPage(source_pdf.getPage(page_number - 1))
        page_path = f"{parent_directory}/{file_stem}_page_{page_number}.pdf"
        page_buffer = BytesIO()
        single_page_writer.write(page_buffer)
        output_folder.upload_stream(page_path, page_buffer.getvalue())
        generated_paths.append(page_path)
    return generated_paths
def format_save_image(self, output_folder: dataiku.Folder, image_path: AnyStr, response: Dict) -> bool:
    """Generic method to format an image of `self.input_folder` with `self.format_image` and save it elsewhere

    Do not override this method!

    Args:
        output_folder: `dataiku.Folder` where the image is uploaded, under the same `image_path`
        image_path: path of the image in `self.input_folder`
        response: response passed to `self.format_image`. If empty, a plain copy of the image is saved instead

    Returns:
        True if the image was uploaded, False if one of `self.IMAGE_FORMATTING_EXCEPTIONS` was caught

    """
    with self.input_folder.get_download_stream(image_path) as stream:
        try:
            pil_image = Image.open(stream)
            if len(response) == 0:
                # Nothing to draw: save the image as it is
                formatted_image = pil_image.copy()
            else:
                formatted_image = self.format_image(pil_image, response)
            image_bytes = save_image_bytes(formatted_image, image_path)
            output_folder.upload_stream(image_path, image_bytes.getvalue())
            return True
        except self.IMAGE_FORMATTING_EXCEPTIONS as error:
            logging.warning(f"Could not format image on path: {image_path} because of error: {error}")
            # In FAIL mode the traceback is logged as well, but the error is still not re-raised
            if self.error_handling == ErrorHandling.FAIL:
                logging.exception(error)
    return False
def _merge_pdf(
    self,
    input_folder: dataiku.Folder,
    output_folder: dataiku.Folder,
    input_path_list: List[AnyStr],
    output_path: AnyStr,
) -> AnyStr:
    """Concatenate the pages of several PDF files into a single PDF file

    Args:
        input_folder: `dataiku.Folder` where the input PDF files are stored
        output_folder: `dataiku.Folder` where the merged PDF file will be saved
        input_path_list: List of PDF file paths in the `input_folder`, in the order they should be merged
        output_path: Path of the merged PDF file in the `output_folder`

    Returns:
        Path of the merged PDF file (same value as `output_path`)

    """
    merged_writer = PdfFileWriter()
    for source_path in input_path_list:
        # Read each file in memory so the PDF reader does not depend on the download stream
        with input_folder.get_download_stream(source_path) as stream:
            source_pdf = PdfFileReader(BytesIO(stream.read()))
        page_count = source_pdf.getNumPages()
        for page_index in range(page_count):
            merged_writer.addPage(source_pdf.getPage(page_index))
    # Serialize the merged document in memory, then upload it in one go
    merged_buffer = BytesIO()
    merged_writer.write(merged_buffer)
    output_folder.upload_stream(output_path, merged_buffer.getvalue())
    return output_path
def _split_tiff(self, input_folder: dataiku.Folder, output_folder: dataiku.Folder, input_path: AnyStr) -> List[AnyStr]:
    """Split a TIFF file into multiple pages and save them as separate files in another folder

    Args:
        input_folder: `dataiku.Folder` where the input TIFF file is stored
        output_folder: `dataiku.Folder` where files will be saved
        input_path: path of the input TIFF file in the `input_folder`

    Returns:
        List of paths generated in the `output_folder`, one per TIFF frame, in frame order.
        Each path is "<input directory>/<input file name without extension>_page_<N>.tiff", N starting at 1

    """
    # `Image.open` is lazy: frame data is only decoded by the `seek`/`save` calls below, which run after
    # the download stream has been closed. Buffer the file in memory (as done for PDF files) so that
    # decoding never depends on the state or seekability of the download stream.
    with input_folder.get_download_stream(input_path) as stream:
        pil_image = Image.open(BytesIO(stream.read()))
    input_path_without_file_name = os.path.split(input_path)[0]
    input_file_name_without_extension = os.path.splitext(os.path.basename(input_path))[0]
    page = 0
    output_path_list = []
    while True:
        try:
            pil_image.seek(page)
            output_path = f"{input_path_without_file_name}/{input_file_name_without_extension}_page_{page+1}.tiff"
            image_bytes = BytesIO()
            pil_image.save(image_bytes, format="TIFF")  # saves the current frame only
            output_folder.upload_stream(output_path, image_bytes.getvalue())
            output_path_list.append(output_path)
            page += 1
        except EOFError:
            # PIL signals a seek past the last frame with EOFError: all pages have been written
            break
    return output_path_list
def save_array_to_folder(array: np.array, path: AnyStr, folder: dataiku.Folder, compress: bool = True) -> None:
    """Serialize a numpy array in `.npz` format and upload it to a Dataiku folder

    Args:
        array: numpy array to save
        path: destination path of the file in the `folder`
        folder: `dataiku.Folder` where the file is uploaded
        compress: if True (default), write with `np.savez_compressed`, otherwise with `np.savez`

    """
    write_npz = np.savez_compressed if compress else np.savez
    with NamedTemporaryFile() as temporary_file:
        write_npz(temporary_file, array)
        # Rewind so that the upload reads the archive from its first byte
        temporary_file.seek(0)
        # Upload while the temporary file is still open: it is deleted when the `with` block exits
        folder.upload_stream(path, temporary_file)
def format_save_pdf_document(self, output_folder: dataiku.Folder, pdf_path: AnyStr, response: Dict) -> bool:
    """Read a PDF file of `self.input_folder`, format it with `self.format_pdf_document` and save it elsewhere

    Args:
        output_folder: `dataiku.Folder` where the PDF file is uploaded, under the same `pdf_path`
        pdf_path: path of the PDF file in `self.input_folder`
        response: response passed to `self.format_pdf_document`. If empty, the PDF is saved without formatting

    Returns:
        True if the PDF file was uploaded, False if a `PdfError`, `ValueError`, `TypeError` or `OSError` was caught

    """
    is_saved = False
    with self.input_folder.get_download_stream(pdf_path) as stream:
        try:
            document = PdfReader(BytesIO(stream.read()))
            if len(response) > 0:
                document = self.format_pdf_document(document, response)
            serialized_document = self.doc_handler.save_pdf_bytes(document)
            output_folder.upload_stream(pdf_path, serialized_document.getvalue())
            is_saved = True
        except (PdfError, ValueError, TypeError, OSError) as error:
            logging.warning(f"Could not annotate PDF on path: {pdf_path} because of error: {error}")
            # In FAIL mode the traceback is logged as well, but the error is still not re-raised
            if self.error_handling == ErrorHandling.FAIL:
                logging.exception(error)
    return is_saved
def format_save_image(self, output_folder: dataiku.Folder, image_path: AnyStr, response: Dict) -> bool:
    """Format an image of `self.input_folder` with `self.format_image` and save it to another folder

    Args:
        output_folder: `dataiku.Folder` where the image is uploaded, under the same `image_path`
        image_path: path of the image in `self.input_folder`
        response: response passed to `self.format_image`. If empty, a plain copy of the image is saved instead

    Returns:
        True if the image was uploaded, False if an error was caught and not re-raised

    Raises:
        UnidentifiedImageError, TypeError, OSError: re-raised only if `self.error_handling` is
            `ErrorHandlingEnum.FAIL`
    """
    is_saved = False
    with self.input_folder.get_download_stream(image_path) as stream:
        try:
            source_image = Image.open(stream)
            has_response = len(response) != 0
            output_image = self.format_image(source_image, response) if has_response else source_image.copy()
            output_bytes = save_image_bytes(output_image, image_path)
            output_folder.upload_stream(image_path, output_bytes.getvalue())
            is_saved = True
        except (UnidentifiedImageError, TypeError, OSError) as e:
            logging.warning("Could not load image on path: " + image_path)
            # Only the FAIL mode propagates the error, otherwise the failure is reported through the return value
            if self.error_handling == ErrorHandlingEnum.FAIL:
                raise e
    return is_saved
def _merge_tiff(
    self,
    input_folder: dataiku.Folder,
    output_folder: dataiku.Folder,
    input_path_list: List[AnyStr],
    output_path: AnyStr,
) -> AnyStr:
    """Merge several TIFF files into a single one

    Args:
        input_folder: `dataiku.Folder` where the input TIFF files are stored
        output_folder: `dataiku.Folder` where the merged TIFF file will be saved
        input_path_list: List of TIFF file paths in the `input_folder`, in the order they should be merged
        output_path: Path of the merged TIFF file

    Returns:
        Path of the merged TIFF file

    Raises:
        IndexError: if `input_path_list` is empty

    """
    # Load all TIFF images in a list
    # `Image.open` is lazy: pixel data is only decoded by the `save` call below, which runs after every
    # download stream has been closed. Buffer each file in memory (as done for PDF files) so that
    # decoding never depends on the state or seekability of the download streams.
    image_list = []
    for input_path in input_path_list:
        with input_folder.get_download_stream(input_path) as stream:
            image_list.append(Image.open(BytesIO(stream.read())))
    # Save them to a single image object
    image_bytes = BytesIO()
    if len(image_list) > 1:
        image_list[0].save(image_bytes, append_images=image_list[1:], save_all=True, format="TIFF")
    else:
        image_list[0].save(image_bytes, format="TIFF")
    # Save image to output_folder
    output_folder.upload_stream(output_path, image_bytes.getvalue())
    return output_path