import glob
import logging
import os
import tarfile

import numpy as np

# Project-local helpers (file_util, config, settings, figure_utils,
# pdffigures_wrapper, the diff/figure generators, imread, ReadTimeout, and the
# ARXIV_*/LOCAL_*/PUBMED_* constants) come from the surrounding package and
# are omitted here.


def process_paper_tar(paper_tarname: str) -> None:
    """Extract one arXiv source tarball and write its per-page figure JSON."""
    parts = paper_tarname.split('/')
    partition_name = parts[-2]
    paper_name = os.path.splitext(parts[-1])[0]
    result_path = os.path.join(ARXIV_FIGURE_JSON_DIR, partition_name,
                               paper_name + '.json')
    paper_dir = os.path.join(ARXIV_SRC_DIR, partition_name, paper_name)
    # Skip papers whose results already exist on disk.
    if os.path.isfile(result_path):
        return
    print('.', end='', flush=True)
    try:
        file_util.extract_tarfile(paper_tarname, paper_dir)
    except tarfile.ReadError:
        logging.debug('File %s is not a tar', paper_tarname)
        return
    diffs = generate_diffs(paper_dir)
    if diffs is None:
        return
    figures_by_page = dict()
    for diff in diffs:
        figures = consume_diff_generate_figures(diff)
        if figures is None:
            continue
        page_name = os.path.dirname(diff) + '/' + diff[diff.find('black.pdf-'):]
        figures_by_page[page_name] = figures
    file_util.safe_makedirs(os.path.dirname(result_path))
    file_util.write_json_atomic(
        result_path,
        config.JsonSerializable.serialize(figures_by_page),
        sort_keys=True)
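# A minimal fan-out sketch (not part of the original module): how
# process_paper_tar might be mapped over a batch of arXiv source tarballs.
# The multiprocessing driver, the tar_glob pattern, and process_partition
# itself are illustrative assumptions, not the project's actual entry point.
import multiprocessing


def process_partition(tar_glob: str, num_workers: int = 8) -> None:
    """Run process_paper_tar over every tarball matching tar_glob in parallel."""
    tarnames = sorted(glob.glob(tar_glob))
    with multiprocessing.Pool(num_workers) as pool:
        pool.map(process_paper_tar, tarnames)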
def extract_figures_json(pdf_path, page_image_paths, pdffigures_output,
                         output_directory):
    """Extract information about figures to JSON and save to disk.

    :param str pdf_path: path to the PDF from which to extract figures.
    :param list page_image_paths: paths to rendered images of each PDF page.
    :param pdffigures_output: parsed pdffigures2 output for this PDF.
    :param str output_directory: directory in which to write the result JSON.

    :returns: path to the JSON file containing the detection results.
    """
    page_images_array = np.array(
        [imread(page_image_path) for page_image_path in page_image_paths])
    detector = get_detector()
    figure_boxes_by_page = detector.get_detections(page_images_array)
    pdffigures_captions = pdffigures_wrapper.get_captions(
        pdffigures_output=pdffigures_output,
        target_dpi=settings.DEFAULT_INFERENCE_DPI)
    figures_by_page = []
    for page_num in range(len(page_image_paths)):
        figure_boxes = figure_boxes_by_page[page_num]
        pf_page_captions = [
            caption for caption in pdffigures_captions
            if caption.page == page_num
        ]
        caption_boxes = [
            caption.caption_boundary for caption in pf_page_captions
        ]
        # Pair detected figure boxes with pdffigures2 caption boxes.
        figure_indices, caption_indices = figure_utils.pair_boxes(
            figure_boxes, caption_boxes)
        page_image = page_images_array[page_num]
        pad_pixels = PAD_FACTOR * min(page_image.shape[:2])
        for (figure_idx, caption_idx) in zip(figure_indices, caption_indices):
            figures_by_page.append(
                Figure(
                    figure_boundary=figure_boxes[figure_idx].expand_box(
                        pad_pixels).crop_to_page(
                            page_image.shape).crop_whitespace_edges(page_image),
                    caption_boundary=caption_boxes[caption_idx],
                    caption_text=pf_page_captions[caption_idx].caption_text,
                    name=pf_page_captions[caption_idx].name,
                    figure_type=pf_page_captions[caption_idx].figure_type,
                    page=page_num))
    pdf_detection_result = PdfDetectionResult(
        pdf=pdf_path,
        figures=figures_by_page,
        dpi=settings.DEFAULT_INFERENCE_DPI,
        raw_detected_boxes=figure_boxes_by_page,
        raw_pdffigures_output=pdffigures_output)
    # pdf_path is assumed to end in '.pdf'; the extension is sliced off
    # before appending the results suffix.
    output_path = os.path.join(
        output_directory,
        os.path.basename(pdf_path)[:-4] + 'deepfigures-results.json')
    file_util.write_json_atomic(
        output_path,
        pdf_detection_result.to_dict(),
        indent=2,
        sort_keys=True)
    return output_path
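# A hedged usage sketch for extract_figures_json: the page images are assumed
# to be pre-rendered at settings.DEFAULT_INFERENCE_DPI, and pdffigures_output
# to be the parsed pdffigures2 result produced upstream. All paths below are
# placeholders.
#
#     page_images = sorted(glob.glob('/tmp/paper/page-*.png'))
#     result_json_path = extract_figures_json(
#         pdf_path='/tmp/paper/paper.pdf',
#         page_image_paths=page_images,
#         pdffigures_output=pdffigures_output,
#         output_directory='/tmp/paper/')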
def process_paper_tar(self):
    """Instance-method variant of process_paper_tar that also augments figure
    images and returns the transformed figure/caption boundaries."""
    print('------Processing paper_tarname : {}--------'.format(
        self.paper_tarname))
    parts = self.paper_tarname.split('/')
    partition_name = parts[-2]
    paper_name = os.path.splitext(parts[-1])[0]
    result_path = os.path.join(self.ARXIV_FIGURE_JSON_DIR, partition_name,
                               paper_name + '.json')
    paper_dir = os.path.join(self.ARXIV_SRC_DIR, partition_name, paper_name)
    # Skip papers whose results already exist on disk.
    if os.path.isfile(result_path):
        return
    print('.', end='', flush=True)
    try:
        file_util.extract_tarfile(self.paper_tarname, paper_dir)
    except tarfile.ReadError:
        logging.debug('File %s is not a tar', self.paper_tarname)
        return
    try:
        diffs, black_ims_paths = self.generate_diffs(paper_dir)
    except TypeError:
        # generate_diffs returned None, which cannot be unpacked.
        return
    if diffs is None:
        return
    figures_by_page = dict()
    for idx, diff in enumerate(diffs):
        figures = self.consume_diff_generate_figures(diff)
        if figures is None:
            continue
        try:
            figures = self.augment_images(black_ims_paths[idx], figures)
        except Exception as e:
            print('Error augmenting images for image path: {}. '
                  'Exception message: {}'.format(black_ims_paths[idx], e))
        page_name = os.path.dirname(diff) + '/' + diff[diff.find('black.pdf-'):]
        figures_by_page[page_name] = figures
    file_util.safe_makedirs(os.path.dirname(result_path))
    file_util.write_json_atomic(
        result_path,
        config.JsonSerializable.serialize(figures_by_page),
        sort_keys=True)
    figure_boundaries, caption_boundaries = transform_figure_json(
        result_path, self.ignore_pages_with_no_figures)
    return result_path, figure_boundaries, caption_boundaries
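# Call-site sketch (assumes an instance of the enclosing class, which is not
# shown here, with paper_tarname, the ARXIV_* directories, and
# ignore_pages_with_no_figures configured). Note the mixed return contract:
# the early exits return None while the happy path returns a 3-tuple, so
# callers must check before unpacking.
#
#     output = generator.process_paper_tar()
#     if output is not None:
#         result_path, figure_boundaries, caption_boundaries = output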
def run_full_pipeline(tarpath: str,
                      skip_done: bool = True,
                      save_intermediate: bool = False) -> None:
    """Extract figures from every PDF in one PubMed tarball and write the
    combined results as JSON."""
    foldername = str(os.path.basename(tarpath).split('.')[0])
    result_path = (LOCAL_FIGURE_JSON_DIR + get_bin(tarpath) + foldername +
                   '.json')
    if skip_done and file_util.exists(result_path):
        return
    d = LOCAL_INTERMEDIATE_DIR + get_bin(tarpath)
    while True:
        try:
            # Don't use streaming extraction:
            # botocore.vendored.requests.packages.urllib3.exceptions.ReadTimeoutError
            # can't be caught here because it doesn't inherit from BaseException.
            file_util.extract_tarfile(tarpath, d, streaming=False)
            break
        except FileNotFoundError:
            logging.exception('Failure reading %s, retrying', tarpath)
        except ReadTimeout:
            logging.exception('Timeout reading %s, retrying', tarpath)
    pdfs = glob.glob(d + foldername + '/*.pdf')
    res = dict()
    for pdf in pdfs:
        # Record each PDF's SHA-1 alongside it, then match its figures.
        sha1sum = file_util.compute_sha1(pdf)
        with open(pdf + '.sha1', 'w') as f:
            print(sha1sum, file=f)
        paper_figures = match_figures(pdf)
        if paper_figures is not None:
            res.update(paper_figures)
    if save_intermediate:
        intermediate_path = (PUBMED_INTERMEDIATE_DIR + get_bin(tarpath) +
                             foldername + '/')
        for file in glob.glob(d + '/' + foldername + '/*'):
            file_util.copy(file, intermediate_path + os.path.basename(file))
    file_util.safe_makedirs(os.path.dirname(result_path))
    file_util.write_json_atomic(
        result_path,
        config.JsonSerializable.serialize(res),
        indent=2,
        sort_keys=True)
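# A hedged driver sketch for run_full_pipeline over a local directory of
# PubMed tarballs; the directory path and file pattern are placeholders.
#
#     for tarpath in sorted(glob.glob('/data/pubmed/*.tar.gz')):
#         run_full_pipeline(tarpath, skip_done=True, save_intermediate=False)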