def extract_figures_json(pdf_path, page_image_paths, pdffigures_output,
                         output_directory):
    """Extract information about figures to JSON and save to disk.

    Runs the figure detector over the page images, pairs each detected
    figure box with a caption obtained from the pdffigures output, and
    writes the resulting ``PdfDetectionResult`` as JSON.

    :param str pdf_path: path to the PDF from which to extract figures.
      Only its base name (without extension) is used to name the output
      file; the path itself is recorded in the result.
    :param page_image_paths: paths of the page images to read, one per
      page. The position of an image in this sequence is used as its
      page number.
    :param pdffigures_output: pdffigures output, handed to
      ``pdffigures_wrapper.get_captions`` and stored unchanged in the
      result as ``raw_pdffigures_output``.
    :param str output_directory: directory in which the JSON file is
      written.

    :returns: path to the JSON file containing the detection results.
    """
    page_images_array = np.array(
        [imread(page_image_path) for page_image_path in page_image_paths])
    detector = get_detector()
    figure_boxes_by_page = detector.get_detections(page_images_array)
    pdffigures_captions = pdffigures_wrapper.get_captions(
        pdffigures_output=pdffigures_output,
        target_dpi=settings.DEFAULT_INFERENCE_DPI)

    # Flat list of paired figures across all pages; each Figure carries
    # its own page number.
    figures = []
    for page_num, page_image in enumerate(page_images_array):
        figure_boxes = figure_boxes_by_page[page_num]
        # NOTE(review): assumes caption.page uses the same 0-based
        # numbering as the order of page_image_paths -- confirm.
        pf_page_captions = [
            caption for caption in pdffigures_captions
            if caption.page == page_num
        ]
        caption_boxes = [
            caption.caption_boundary for caption in pf_page_captions
        ]
        figure_indices, caption_indices = figure_utils.pair_boxes(
            figure_boxes, caption_boxes)

        # Padding is proportional to the smaller of the first two image
        # dimensions.
        pad_pixels = PAD_FACTOR * min(page_image.shape[:2])
        for figure_idx, caption_idx in zip(figure_indices, caption_indices):
            caption = pf_page_captions[caption_idx]
            # Grow the detected box, clamp it to the page, then trim
            # whitespace edges.
            figure_boundary = figure_boxes[figure_idx].expand_box(
                pad_pixels).crop_to_page(
                    page_image.shape).crop_whitespace_edges(page_image)
            figures.append(
                Figure(figure_boundary=figure_boundary,
                       caption_boundary=caption_boxes[caption_idx],
                       caption_text=caption.caption_text,
                       name=caption.name,
                       figure_type=caption.figure_type,
                       page=page_num))

    pdf_detection_result = PdfDetectionResult(
        pdf=pdf_path,
        figures=figures,
        dpi=settings.DEFAULT_INFERENCE_DPI,
        raw_detected_boxes=figure_boxes_by_page,
        raw_pdffigures_output=pdffigures_output)

    # Strip the real extension instead of blindly slicing off the last
    # four characters; for '*.pdf' inputs the resulting name is identical.
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    output_path = os.path.join(output_directory,
                               pdf_stem + 'deepfigures-results.json')
    file_util.write_json_atomic(output_path,
                                pdf_detection_result.to_dict(),
                                indent=2,
                                sort_keys=True)
    return output_path
def detect_batch(
        src_pdfs: List[str],
        detector: TensorboxCaptionmaskDetector,
        conf_threshold: float = .5) -> Iterable[PdfDetectionResult]:
    """Lazily run figure detection over a batch of PDFs.

    Each PDF is copied into its own temporary working directory, run
    through ``pdffigures_extractor.extract``, and then through
    ``detect_figures``. One ``PdfDetectionResult`` is yielded per input,
    in input order.

    The temporary directory for a PDF is still open while its result is
    being yielded; it is cleaned up once the generator is resumed or
    closed.

    :param src_pdfs: paths of the PDFs to process.
    :param detector: detector instance forwarded to ``detect_figures``.
    :param conf_threshold: confidence threshold forwarded to
      ``detect_figures``.

    :returns: a generator of ``PdfDetectionResult`` objects whose ``pdf``
      field is the original source path (not the temporary copy).
    """
    for src_pdf in src_pdfs:
        scratch = tempfile.TemporaryDirectory(prefix='deepfigures-tensorbox')
        with scratch as working_dir:
            # Flatten the source path into a single file name inside the
            # working directory.
            flat_name = src_pdf.replace('/', '_')
            local_pdf = os.path.join(working_dir, flat_name)
            file_util.copy(src_pdf, local_pdf)

            raw_output = pdffigures_extractor.extract(local_pdf, working_dir)
            # NOTE(review): get_captions is called without target_dpi
            # here, relying on its default -- confirm that default matches
            # settings.DEFAULT_INFERENCE_DPI reported in the result below.
            captions = pdffigures_wrapper.get_captions(raw_output)

            detection = detect_figures(local_pdf,
                                       captions,
                                       detector,
                                       conf_threshold=conf_threshold)
            figures, raw_boxes = detection

            result = PdfDetectionResult(
                pdf=src_pdf,
                figures=figures,
                dpi=settings.DEFAULT_INFERENCE_DPI,
                raw_detected_boxes=raw_boxes,
                raw_pdffigures_output=raw_output)
            yield result