コード例 #1
0
def extract_figures_json(pdf_path, page_image_paths, pdffigures_output,
                         output_directory):
    """Extract information about figures to JSON and save to disk.

    Every page image is loaded and passed to the detector. On each page
    the detected boxes are paired with the pdffigures captions reported
    for that page, and each matched (box, caption) pair becomes one
    ``Figure`` in the result.

    :param str pdf_path: path to the PDF from which to extract
      figures.
    :param page_image_paths: paths of the page images, one per page; the
      position in this sequence is used as the page number when matching
      captions.
    :param pdffigures_output: pdffigures output for the PDF. Captions are
      read from it via ``pdffigures_wrapper.get_captions`` and it is
      stored unchanged in the result.
    :param str output_directory: directory in which the JSON file is
      written.

    :returns: path to the JSON file containing the detection results.
    """
    page_images_array = np.array(
        [imread(page_image_path) for page_image_path in page_image_paths])
    detector = get_detector()
    figure_boxes_by_page = detector.get_detections(page_images_array)
    pdffigures_captions = pdffigures_wrapper.get_captions(
        pdffigures_output=pdffigures_output,
        target_dpi=settings.DEFAULT_INFERENCE_DPI)

    # NOTE: despite the name, this is a flat list of figures across all
    # pages; each Figure carries its own page number.
    figures_by_page = []
    for page_num, page_image in enumerate(page_images_array):
        figure_boxes = figure_boxes_by_page[page_num]
        pf_page_captions = [
            caption for caption in pdffigures_captions
            if caption.page == page_num
        ]
        caption_boxes = [
            caption.caption_boundary for caption in pf_page_captions
        ]
        figure_indices, caption_indices = figure_utils.pair_boxes(
            figure_boxes, caption_boxes)
        # Padding scales with the shorter side of the page image.
        pad_pixels = PAD_FACTOR * min(page_image.shape[:2])
        for figure_idx, caption_idx in zip(figure_indices, caption_indices):
            caption = pf_page_captions[caption_idx]
            # Grow the detected box, clamp it to the page, then trim any
            # blank margin the padding introduced.
            figure_boundary = (
                figure_boxes[figure_idx]
                .expand_box(pad_pixels)
                .crop_to_page(page_image.shape)
                .crop_whitespace_edges(page_image))
            figures_by_page.append(
                Figure(figure_boundary=figure_boundary,
                       caption_boundary=caption_boxes[caption_idx],
                       caption_text=caption.caption_text,
                       name=caption.name,
                       figure_type=caption.figure_type,
                       page=page_num))
    pdf_detection_result = PdfDetectionResult(
        pdf=pdf_path,
        figures=figures_by_page,
        dpi=settings.DEFAULT_INFERENCE_DPI,
        raw_detected_boxes=figure_boxes_by_page,
        raw_pdffigures_output=pdffigures_output)

    # Strip the real extension rather than a fixed four characters, so
    # names without a ".pdf"-length suffix are not truncated.
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    output_path = os.path.join(output_directory,
                               pdf_stem + 'deepfigures-results.json')
    file_util.write_json_atomic(output_path,
                                pdf_detection_result.to_dict(),
                                indent=2,
                                sort_keys=True)
    return output_path
コード例 #2
0
def detect_batch(src_pdfs: List[str],
                 detector: TensorboxCaptionmaskDetector,
                 conf_threshold: float = .5) -> Iterable[PdfDetectionResult]:
    """Yield one detection result per PDF in ``src_pdfs``.

    Each PDF is copied into a fresh temporary directory, run through
    pdffigures to obtain captions, and then through ``detect_figures``
    with the given detector. Results are produced lazily; the temporary
    directory for a PDF still exists while its result is being consumed
    and is cleaned up when the generator is resumed or closed.

    :param src_pdfs: paths of the PDFs to process.
    :param detector: detector instance forwarded to ``detect_figures``.
    :param conf_threshold: confidence threshold forwarded to
      ``detect_figures``.

    :returns: an iterator of ``PdfDetectionResult``, one per input PDF,
      whose ``pdf`` field is the original source path.
    """
    for source in src_pdfs:
        with tempfile.TemporaryDirectory(
                prefix='deepfigures-tensorbox') as scratch_dir:
            # Flatten the source path into a single file name so the copy
            # lands directly inside the scratch directory.
            flat_name = source.replace('/', '_')
            local_pdf = os.path.join(scratch_dir, flat_name)
            file_util.copy(source, local_pdf)

            raw_output = pdffigures_extractor.extract(local_pdf, scratch_dir)
            captions = pdffigures_wrapper.get_captions(raw_output)
            figures, detected_boxes = detect_figures(
                local_pdf,
                captions,
                detector,
                conf_threshold=conf_threshold)

            result = PdfDetectionResult(
                pdf=source,
                figures=figures,
                dpi=settings.DEFAULT_INFERENCE_DPI,
                raw_detected_boxes=detected_boxes,
                raw_pdffigures_output=raw_output)
            yield result