Ejemplo n.º 1
0
def run_full_pipeline(
    tarpath: str, skip_done: bool=True, save_intermediate: bool=False
) -> None:
    """Extract one tar archive, match figures in its PDFs and write a JSON result.

    The archive is extracted under ``LOCAL_INTERMEDIATE_DIR``; every ``*.pdf``
    found in the extracted folder gets a sibling ``<pdf>.sha1`` file and is
    passed to ``match_figures``. All non-``None`` results are merged into one
    dict which is serialized to
    ``LOCAL_FIGURE_JSON_DIR + get_bin(tarpath) + <foldername>.json``.

    Args:
        tarpath: Path of the tar archive. The part of its basename before the
            first ``.`` is used as the folder name and the result file name.
        skip_done: When true, return immediately if the result JSON already
            exists.
        save_intermediate: When true, copy every file of the extracted folder
            to the matching location under ``PUBMED_INTERMEDIATE_DIR``.

    Extraction is retried indefinitely on ``FileNotFoundError`` and
    ``ReadTimeout``; any other exception propagates to the caller.
    """
    # Local import so this block does not depend on a file-level import.
    import time

    # Short pause between extraction attempts. Without it an immediately
    # failing read (e.g. FileNotFoundError) turns the retry loop into a busy
    # loop that floods the log with tracebacks.
    retry_delay_seconds = 1.0

    foldername = os.path.basename(tarpath).split('.')[0]
    bin_dir = get_bin(tarpath)
    result_path = LOCAL_FIGURE_JSON_DIR + bin_dir + foldername + '.json'
    if skip_done and file_util.exists(result_path):
        return

    extract_dir = LOCAL_INTERMEDIATE_DIR + bin_dir
    while True:
        try:
            # Streaming is deliberately disabled: per the original author,
            # botocore.vendored.requests.packages.urllib3.exceptions.ReadTimeoutError
            # can't be caught because it doesn't inherit from BaseException.
            file_util.extract_tarfile(tarpath, extract_dir, streaming=False)
            break
        except FileNotFoundError:
            logging.exception('Failure reading %s, retrying', tarpath)
        except ReadTimeout:
            logging.exception('Timeout reading %s, retrying', tarpath)
        time.sleep(retry_delay_seconds)

    res = {}
    for pdf in glob.glob(extract_dir + foldername + '/' + '*.pdf'):
        # Record the checksum next to the PDF before processing it.
        sha1sum = file_util.compute_sha1(pdf)
        with open(pdf + '.sha1', 'w') as f:
            print(sha1sum, file=f)
        paper_figures = match_figures(pdf)
        # match_figures may return None; such PDFs contribute nothing.
        if paper_figures is not None:
            res.update(paper_figures)

    if save_intermediate:
        intermediate_path = PUBMED_INTERMEDIATE_DIR + bin_dir + foldername + '/'
        for extracted_file in glob.glob(
            extract_dir + '/' + foldername + '/' + '*'
        ):
            file_util.copy(
                extracted_file,
                intermediate_path + os.path.basename(extracted_file)
            )

    file_util.safe_makedirs(os.path.dirname(result_path))
    file_util.write_json_atomic(
        result_path,
        config.JsonSerializable.serialize(res),
        indent=2,
        sort_keys=True
    )
Ejemplo n.º 2
0
def detect_batch(src_pdfs: List[str],
                 detector: TensorboxCaptionmaskDetector,
                 conf_threshold: float = .5) -> Iterable[PdfDetectionResult]:
    """Lazily run figure detection on each PDF and yield one result per input.

    For every path in ``src_pdfs`` the PDF is copied into a fresh temporary
    directory, pdffigures is run on the copy, its captions are extracted, and
    ``detect_figures`` is invoked with those captions and ``detector``.

    Args:
        src_pdfs: Paths of the PDFs to process, handled in the given order.
        detector: Detector instance forwarded to ``detect_figures``.
        conf_threshold: Forwarded to ``detect_figures`` as ``conf_threshold``.

    Yields:
        A ``PdfDetectionResult`` for each input PDF. The result is yielded
        while the temporary directory still exists; the directory is removed
        once the generator is resumed or closed.
    """
    for source_path in src_pdfs:
        scratch = tempfile.TemporaryDirectory(prefix='deepfigures-tensorbox')
        with scratch as working_dir:
            # Flatten the source path into a single file name in the scratch
            # directory.
            local_name = source_path.replace('/', '_')
            local_pdf = os.path.join(working_dir, local_name)
            file_util.copy(source_path, local_pdf)

            raw_output = pdffigures_extractor.extract(local_pdf, working_dir)
            captions = pdffigures_wrapper.get_captions(raw_output)

            detection = detect_figures(
                local_pdf,
                captions,
                detector,
                conf_threshold=conf_threshold)
            page_figures, page_boxes = detection

            result = PdfDetectionResult(
                pdf=source_path,
                figures=page_figures,
                dpi=settings.DEFAULT_INFERENCE_DPI,
                raw_detected_boxes=page_boxes,
                raw_pdffigures_output=raw_output)
            yield result