コード例 #1
0
    def _split_pdf(self, input_folder: dataiku.Folder,
                   output_folder: dataiku.Folder,
                   input_path: AnyStr) -> List[AnyStr]:
        """Write every page of a PDF as its own single-page PDF in another folder

        Each page is uploaded next to the original location, under the name
        `<input file name without extension>_page_<page number starting at 1>.pdf`.

        Args:
            input_folder: `dataiku.Folder` containing the PDF file to split
            output_folder: `dataiku.Folder` receiving the single-page PDF files
            input_path: path of the PDF file to split, relative to `input_folder`

        Returns:
            Paths of the files uploaded to `output_folder`, in page order

        """
        # The whole document is buffered in memory, so the reader stays usable once the stream is closed
        with input_folder.get_download_stream(input_path) as stream:
            reader = PdfFileReader(BytesIO(stream.read()))
        parent_path, file_name = os.path.split(input_path)
        file_stem = os.path.splitext(file_name)[0]
        generated_paths = []
        for page_number in range(1, reader.getNumPages() + 1):
            # PyPDF2 pages are indexed from 0 while file names are numbered from 1
            single_page_writer = PdfFileWriter()
            single_page_writer.addPage(reader.getPage(page_number - 1))
            page_buffer = BytesIO()
            single_page_writer.write(page_buffer)
            page_path = f"{parent_path}/{file_stem}_page_{page_number}.pdf"
            output_folder.upload_stream(page_path, page_buffer.getvalue())
            generated_paths.append(page_path)
        return generated_paths
    def format_save_image(self, output_folder: dataiku.Folder,
                          image_path: AnyStr, response: Dict) -> bool:
        """Run `self.format_image` on one image of `self.input_folder` and upload the outcome to `output_folder`

        Do not override this method!

        Args:
            output_folder: `dataiku.Folder` where the resulting image is uploaded
            image_path: path of the image, used to read from `self.input_folder` and to write to `output_folder`
            response: passed to `self.format_image`; when it is empty the image is copied without formatting

        Returns:
            True if the image was uploaded, False if one of `self.IMAGE_FORMATTING_EXCEPTIONS` was caught

        """
        saved = False
        with self.input_folder.get_download_stream(image_path) as stream:
            try:
                source_image = Image.open(stream)
                if len(response) == 0:
                    # Nothing to draw: forward an unmodified copy
                    output_image = source_image.copy()
                else:
                    output_image = self.format_image(source_image, response)
                encoded_image = save_image_bytes(output_image, image_path)
                output_folder.upload_stream(image_path, encoded_image.getvalue())
            except self.IMAGE_FORMATTING_EXCEPTIONS as error:
                logging.warning(
                    f"Could not format image on path: {image_path} because of error: {error}"
                )
                # In FAIL mode the error is logged as an exception; it is not re-raised from here
                if self.error_handling == ErrorHandling.FAIL:
                    logging.exception(error)
            else:
                saved = True
        return saved
コード例 #3
0
    def _merge_pdf(
        self,
        input_folder: dataiku.Folder,
        output_folder: dataiku.Folder,
        input_path_list: List[AnyStr],
        output_path: AnyStr,
    ) -> AnyStr:
        """Concatenate the pages of several PDF files into one PDF file

        Pages are appended in the order of `input_path_list`, keeping the page order of each file.

        Args:
            input_folder: `dataiku.Folder` containing the PDF files to merge
            output_folder: `dataiku.Folder` receiving the merged PDF file
            input_path_list: paths of the PDF files to merge, relative to `input_folder`
            output_path: path of the merged PDF file in `output_folder`

        Returns:
            `output_path`, unchanged

        """
        merged_writer = PdfFileWriter()
        for pdf_path in input_path_list:
            # Buffer each file in memory so its pages remain readable after the stream is closed
            with input_folder.get_download_stream(pdf_path) as stream:
                reader = PdfFileReader(BytesIO(stream.read()))
            page_total = reader.getNumPages()
            for page_index in range(page_total):
                merged_writer.addPage(reader.getPage(page_index))
        # Serialize the merged document and upload it
        merged_buffer = BytesIO()
        merged_writer.write(merged_buffer)
        output_folder.upload_stream(output_path, merged_buffer.getvalue())
        return output_path
コード例 #4
0
    def _split_tiff(self, input_folder: dataiku.Folder,
                    output_folder: dataiku.Folder,
                    input_path: AnyStr) -> List[AnyStr]:
        """Split a TIFF file into multiple pages and save them as separate files in another folder

        Each frame is uploaded next to the original location, under the name
        `<input file name without extension>_page_<page number starting at 1>.tiff`.

        Args:
            input_folder: `dataiku.Folder` where the input TIFF file is stored
            output_folder: `dataiku.Folder` where files will be saved
            input_path: path of the input TIFF file in the `input_folder`

        Returns:
            List of paths generated in the `output_folder`, in page order

        """
        # Pillow opens images lazily: frame data is only read later, on `seek`/`save`.
        # Buffer the file in memory (as `_split_pdf` does) so the image does not depend
        # on the download stream, which is closed when leaving the `with` block.
        with input_folder.get_download_stream(input_path) as stream:
            pil_image = Image.open(BytesIO(stream.read()))
        input_path_without_file_name = os.path.split(input_path)[0]
        input_file_name_without_extension = os.path.splitext(
            os.path.basename(input_path))[0]
        page = 0
        output_path_list = []
        while True:
            # Only `seek` signals the end of the frame sequence; keeping save/upload out of
            # the `try` avoids silently truncating the output on an unrelated EOFError.
            try:
                pil_image.seek(page)
            except EOFError:
                break
            output_path = f"{input_path_without_file_name}/{input_file_name_without_extension}_page_{page+1}.tiff"
            image_bytes = BytesIO()
            pil_image.save(image_bytes, format="TIFF")
            output_folder.upload_stream(output_path, image_bytes.getvalue())
            output_path_list.append(output_path)
            page += 1
        return output_path_list
コード例 #5
0
def save_array_to_folder(array: np.array,
                         path: AnyStr,
                         folder: dataiku.Folder,
                         compress: bool = True) -> None:
    """Serialize a numpy array with `np.savez`/`np.savez_compressed` and upload it to a Dataiku folder

    Args:
        array: numpy array to save; it is passed positionally to the numpy writer
        path: destination path of the uploaded file in `folder`
        folder: `dataiku.Folder` receiving the file
        compress: use `np.savez_compressed` when truthy, `np.savez` otherwise

    """
    write_archive = np.savez_compressed if compress else np.savez
    with NamedTemporaryFile() as temp_file:
        write_archive(temp_file, array)
        # Rewind so that the upload reads the archive from its first byte
        temp_file.seek(0)
        folder.upload_stream(path, temp_file)
 def format_save_pdf_document(self, output_folder: dataiku.Folder, pdf_path: AnyStr, response: Dict) -> bool:
     """Read a PDF from `self.input_folder`, annotate it with `self.format_pdf_document` and upload it to another folder

     Args:
         output_folder: `dataiku.Folder` where the annotated PDF is uploaded
         pdf_path: path of the PDF, used to read from `self.input_folder` and to write to `output_folder`
         response: passed to `self.format_pdf_document`; when it is empty the PDF is parsed but nothing is uploaded

     Returns:
         True if no handled error occurred (including the empty `response` case), False otherwise

     """
     succeeded = False
     with self.input_folder.get_download_stream(pdf_path) as stream:
         try:
             document = PdfReader(BytesIO(stream.read()))
             if len(response) != 0:
                 document = self.format_pdf_document(document, response)
                 serialized_pdf = self.doc_handler.save_pdf_bytes(document)
                 output_folder.upload_stream(pdf_path, serialized_pdf.getvalue())
         except (PdfError, ValueError, TypeError, OSError) as error:
             logging.warning(f"Could not annotate PDF on path: {pdf_path} because of error: {error}")
             # In FAIL mode the error is logged as an exception; it is not re-raised from here
             if self.error_handling == ErrorHandling.FAIL:
                 logging.exception(error)
         else:
             succeeded = True
     return succeeded
 def format_save_image(self, output_folder: dataiku.Folder,
                       image_path: AnyStr, response: Dict) -> bool:
     """Run `self.format_image` on one image of `self.input_folder` and upload the outcome to `output_folder`

     Args:
         output_folder: `dataiku.Folder` where the resulting image is uploaded
         image_path: path of the image, used to read from `self.input_folder` and to write to `output_folder`
         response: passed to `self.format_image`; when it is empty the image is copied without formatting

     Returns:
         True if the image was uploaded, False if a handled error occurred and was not re-raised

     Raises:
         UnidentifiedImageError, TypeError, OSError: re-raised when `self.error_handling` is `ErrorHandlingEnum.FAIL`

     """
     saved = False
     with self.input_folder.get_download_stream(image_path) as stream:
         try:
             source_image = Image.open(stream)
             if len(response) == 0:
                 # Nothing to draw: forward an unmodified copy
                 output_image = source_image.copy()
             else:
                 output_image = self.format_image(source_image, response)
             encoded_image = save_image_bytes(output_image, image_path)
             output_folder.upload_stream(image_path, encoded_image.getvalue())
         except (UnidentifiedImageError, TypeError, OSError) as error:
             logging.warning("Could not load image on path: " + image_path)
             if self.error_handling == ErrorHandlingEnum.FAIL:
                 raise error
         else:
             saved = True
     return saved
コード例 #8
0
    def _merge_tiff(
        self,
        input_folder: dataiku.Folder,
        output_folder: dataiku.Folder,
        input_path_list: List[AnyStr],
        output_path: AnyStr,
    ) -> AnyStr:
        """Merge several TIFF files into a single one

        Args:
            input_folder: `dataiku.Folder` where the input TIFF files are stored
            output_folder: `dataiku.Folder` where the merged TIFF file will be saved
            input_path_list:  List of TIFF file paths in the `input_folder`
            output_path: Path of the merged TIFF file

        Returns:
            Path of the merged TIFF file

        Raises:
            IndexError: if `input_path_list` is empty

        """
        # Load all TIFF images in a list.
        # Pillow opens images lazily: pixel data is only read when saving below, after the
        # download streams are closed. Buffer each file in memory (as `_merge_pdf` does)
        # so the images do not depend on their closed streams.
        image_list = []
        for input_path in input_path_list:
            with input_folder.get_download_stream(input_path) as stream:
                image_list.append(Image.open(BytesIO(stream.read())))
        # Save them to a single image object
        image_bytes = BytesIO()
        first_image, other_images = image_list[0], image_list[1:]
        if other_images:
            first_image.save(image_bytes,
                             append_images=other_images,
                             save_all=True,
                             format="TIFF")
        else:
            first_image.save(image_bytes, format="TIFF")
        # Save image to output_folder
        output_folder.upload_stream(output_path, image_bytes.getvalue())
        return output_path