import logging
import os
import tarfile

# file_util, config, ARXIV_FIGURE_JSON_DIR, ARXIV_SRC_DIR, and the
# generate_diffs/consume_diff_generate_figures helpers come from the
# surrounding project.
def process_paper_tar(paper_tarname: str) -> None:
    """Extract figure annotations from a single arXiv source tarball.

    Untars the paper's source, generates page diffs, derives the figures
    on each diffed page, and atomically writes the per-page figures to
    JSON. Papers whose result JSON already exists are skipped.
    """
    parts = paper_tarname.split('/')
    partition_name = parts[-2]
    paper_name = os.path.splitext(parts[-1])[0]
    result_path = os.path.join(ARXIV_FIGURE_JSON_DIR, partition_name,
                               paper_name + '.json')
    paper_dir = os.path.join(ARXIV_SRC_DIR, partition_name, paper_name)
    if os.path.isfile(result_path):
        return
    print('.', end='', flush=True)  # progress indicator
    try:
        file_util.extract_tarfile(paper_tarname, paper_dir)
    except tarfile.ReadError:
        logging.debug('File %s is not a tar', paper_tarname)
        return
    diffs = generate_diffs(paper_dir)
    if diffs is None:
        return
    figures_by_page = dict()
    for diff in diffs:
        figures = consume_diff_generate_figures(diff)
        if figures is None:
            continue
        page_name = os.path.dirname(
            diff) + '/' + diff[diff.find('black.pdf-'):]
        figures_by_page[page_name] = figures
    file_util.safe_makedirs(os.path.dirname(result_path))
    file_util.write_json_atomic(
        result_path,
        config.JsonSerializable.serialize(figures_by_page),
        sort_keys=True)
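
A minimal batch-driver sketch for the function above, assuming the per-paper tarballs sit in per-partition directories; the directory layout, glob pattern, and function name are assumptions, not part of the source:

import glob
import multiprocessing

def process_all_paper_tars(arxiv_tar_dir: str) -> None:
    # Hypothetical driver: fan each per-paper tarball out to a worker
    # pool; process_paper_tar itself skips already-processed papers.
    tarnames = sorted(glob.glob(os.path.join(arxiv_tar_dir, '*', '*.gz')))
    with multiprocessing.Pool() as pool:
        pool.map(process_paper_tar, tarnames)
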
# np (numpy), imread, get_detector, pdffigures_wrapper, figure_utils,
# settings, Figure, PdfDetectionResult, PAD_FACTOR, and file_util come
# from the surrounding project and its imports.
def extract_figures_json(pdf_path, page_image_paths, pdffigures_output,
                         output_directory):
    """Extract information about figures to JSON and save to disk.

    :param str pdf_path: path to the PDF from which to extract
      figures.
    :param list page_image_paths: paths to rendered images of the
      PDF's pages, in page order.
    :param pdffigures_output: parsed output of pdffigures for this
      PDF, used to recover the figure captions.
    :param str output_directory: directory in which to write the
      result JSON.

    :returns: path to the JSON file containing the detection results.
    """
    page_images_array = np.array(
        [imread(page_image_path) for page_image_path in page_image_paths])
    detector = get_detector()
    figure_boxes_by_page = detector.get_detections(page_images_array)
    pdffigures_captions = pdffigures_wrapper.get_captions(
        pdffigures_output=pdffigures_output,
        target_dpi=settings.DEFAULT_INFERENCE_DPI)
    figures_by_page = []
    for page_num in range(len(page_image_paths)):
        figure_boxes = figure_boxes_by_page[page_num]
        pf_page_captions = [
            caption for caption in pdffigures_captions
            if caption.page == page_num
        ]
        caption_boxes = [
            caption.caption_boundary for caption in pf_page_captions
        ]
        figure_indices, caption_indices = figure_utils.pair_boxes(
            figure_boxes, caption_boxes)
        page_image = page_images_array[page_num]
        pad_pixels = PAD_FACTOR * min(page_image.shape[:2])
        for (figure_idx, caption_idx) in zip(figure_indices, caption_indices):
            figures_by_page.append(
                Figure(figure_boundary=figure_boxes[figure_idx].expand_box(
                    pad_pixels).crop_to_page(
                        page_image.shape).crop_whitespace_edges(page_image),
                       caption_boundary=caption_boxes[caption_idx],
                       caption_text=pf_page_captions[caption_idx].caption_text,
                       name=pf_page_captions[caption_idx].name,
                       figure_type=pf_page_captions[caption_idx].figure_type,
                       page=page_num))
    pdf_detection_result = PdfDetectionResult(
        pdf=pdf_path,
        figures=figures_by_page,
        dpi=settings.DEFAULT_INFERENCE_DPI,
        raw_detected_boxes=figure_boxes_by_page,
        raw_pdffigures_output=pdffigures_output)

    output_path = os.path.join(
        output_directory,
        # [:-4] strips the '.pdf' extension from the basename.
        os.path.basename(pdf_path)[:-4] + 'deepfigures-results.json')
    file_util.write_json_atomic(output_path,
                                pdf_detection_result.to_dict(),
                                indent=2,
                                sort_keys=True)
    return output_path
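
A hedged call-site sketch for extract_figures_json; the paths are illustrative, and the page images are assumed to have been rendered at settings.DEFAULT_INFERENCE_DPI beforehand:

import glob

# Hypothetical call site: pdffigures_output is assumed to have been
# produced by the project's pdffigures wrapper ahead of time.
result_json_path = extract_figures_json(
    pdf_path='/tmp/paper.pdf',
    page_image_paths=sorted(glob.glob('/tmp/paper-page-*.png')),
    pdffigures_output=pdffigures_output,  # assumed precomputed
    output_directory='/tmp/results')
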
Example #3
def process_paper_tar(self):
    """Method variant: extract figures from self.paper_tarname, augment
    the figure images, and transform the resulting JSON.

    Returns (result_path, figure_boundaries, caption_boundaries), or
    None if the paper was already processed or could not be read.
    """
    print("------Processing paper_tarname : {}--------".format(
        self.paper_tarname))
    parts = self.paper_tarname.split('/')
    partition_name = parts[-2]
    paper_name = os.path.splitext(parts[-1])[0]
    result_path = os.path.join(self.ARXIV_FIGURE_JSON_DIR, partition_name,
                               paper_name + '.json')
    paper_dir = os.path.join(self.ARXIV_SRC_DIR, partition_name,
                             paper_name)
    if os.path.isfile(result_path):
        return
    print('.', end='', flush=True)  # progress indicator
    try:
        file_util.extract_tarfile(self.paper_tarname, paper_dir)
    except tarfile.ReadError:
        logging.debug('File %s is not a tar', self.paper_tarname)
        return
    try:
        diffs, black_ims_paths = self.generate_diffs(paper_dir)
    except TypeError:
        # generate_diffs returned None, which cannot be unpacked.
        return
    if diffs is None:
        return
    figures_by_page = dict()
    for idx, diff in enumerate(diffs):
        figures = self.consume_diff_generate_figures(diff)
        if figures is None:
            continue
        try:
            figures = self.augment_images(black_ims_paths[idx], figures)
        except Exception as e:
            print("Error augmenting images for image path: {}. "
                  "Exception message: {}".format(black_ims_paths[idx], e))
        page_name = os.path.dirname(
            diff) + '/' + diff[diff.find('black.pdf-'):]
        figures_by_page[page_name] = figures
    file_util.safe_makedirs(os.path.dirname(result_path))
    file_util.write_json_atomic(
        result_path,
        config.JsonSerializable.serialize(figures_by_page),
        sort_keys=True)
    figure_boundaries, caption_boundaries = transform_figure_json(
        result_path, self.ignore_pages_with_no_figures)
    return result_path, figure_boundaries, caption_boundaries
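
Because this variant is a method, calling it needs an instance; the class name and constructor signature below are hypothetical, not from the source:

# Hypothetical harness: PaperTarProcessor and its arguments are
# assumptions about the surrounding code.
processor = PaperTarProcessor(
    paper_tarname='/data/arxiv/1801/1801.00001.gz',
    ignore_pages_with_no_figures=True)
out = processor.process_paper_tar()
if out is not None:
    result_path, figure_boundaries, caption_boundaries = out
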
Example #4
def run_full_pipeline(
    tarpath: str, skip_done: bool = True, save_intermediate: bool = False
) -> None:
    """Extract figures from every PDF in a PubMed tarball and write the
    combined results to JSON.

    If skip_done is set, tarballs with an existing result file are
    skipped; if save_intermediate is set, the extracted files are
    copied to an intermediate directory.
    """
    foldername = str(os.path.basename(tarpath).split('.')[0])
    result_path = LOCAL_FIGURE_JSON_DIR + get_bin(
        tarpath
    ) + foldername + '.json'
    if skip_done and file_util.exists(result_path):
        return
    d = LOCAL_INTERMEDIATE_DIR + get_bin(tarpath)
    while True:
        try:
            # Don't use streaming here: the ReadTimeoutError raised by
            # botocore's vendored urllib3 is a different class from the
            # importable one, so the except clause below can't catch it.
            file_util.extract_tarfile(tarpath, d, streaming=False)
            break
        except FileNotFoundError:
            logging.exception('Failure reading %s, retrying', tarpath)
        except ReadTimeout:
            logging.exception('Timeout reading %s, retrying', tarpath)
    pdfs = glob.glob(d + foldername + '/' + '*.pdf')
    res = dict()
    for pdf in pdfs:
        sha1sum = file_util.compute_sha1(pdf)
        with open(pdf + '.sha1', 'w') as f:
            print(sha1sum, file=f)
        paper_figures = match_figures(pdf)
        if paper_figures is not None:
            res.update(paper_figures)
    if save_intermediate:
        intermediate_path = PUBMED_INTERMEDIATE_DIR + get_bin(
            tarpath
        ) + foldername + '/'
        for path in glob.glob(d + '/' + foldername + '/' + '*'):
            file_util.copy(path, intermediate_path + os.path.basename(path))
    file_util.safe_makedirs(os.path.dirname(result_path))
    file_util.write_json_atomic(
        result_path,
        config.JsonSerializable.serialize(res),
        indent=2,
        sort_keys=True
    )
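
A short driver sketch for the PubMed pipeline above; the input directory and '*.tar.gz' pattern are assumptions:

import glob

# Hypothetical batch driver: run the pipeline over every tarball in a
# directory, relying on skip_done to make reruns cheap.
for tarpath in sorted(glob.glob('/data/pubmed/*.tar.gz')):
    run_full_pipeline(tarpath, skip_done=True, save_intermediate=False)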