class TitleAndAuthorInfo(JsonSerializable):
    """Serializable record holding a document's title and author entries.

    Each attribute is a trait declaration; the record itself carries no
    behavior beyond what ``JsonSerializable`` provides.
    """

    # NOTE(review): ``AuthorInfo`` is looked up when this class body runs, so
    # it has to be defined or imported before this class -- confirm the
    # definition order in the full module.

    # Identification of the source document.
    pdf = traits.Unicode()
    pdf_sha1 = traits.Unicode()  # presumably the SHA-1 digest of the PDF -- confirm
    image_path = traits.Unicode()

    # Title: its box and its text.
    title_bounding_box = traits.Instance(BoxClass)
    title_text = traits.Unicode()

    # One ``AuthorInfo`` entry per author.
    authors = traits.List(
        traits.Instance(AuthorInfo)
    )
class Figure(JsonSerializable):
    """Serializable description of one figure and its caption.

    Besides the trait declarations, the class offers two static factories
    that build a ``Figure`` from dictionaries in two different layouts
    (``from_pf_ann`` and ``from_pf_output``).
    """

    # Boxes of the figure itself and of its caption.
    figure_boundary = traits.Instance(BoxClass)
    caption_boundary = traits.Instance(BoxClass)
    caption_text = traits.Unicode()
    name = traits.Unicode()
    page = traits.Int()
    figure_type = traits.Unicode()
    dpi = traits.Int()
    page_width = traits.Int()
    page_height = traits.Int()
    # URI to cropped image of the figure
    uri = traits.Unicode(default_value=None, allow_none=True)

    def page_size(self) -> Tuple[int, int]:
        """Return the page dimensions as ``(page_height, page_width)``."""
        height = self.page_height
        width = self.page_width
        return height, width

    @staticmethod
    def from_pf_ann(ann: dict, target_page_size: Tuple[int, int]) -> 'Figure':
        """Build a ``Figure`` from an annotation dict, resized to a target page.

        Args:
            ann: Annotation dictionary. The keys read here are
                ``page_height``, ``page_width``, ``region_bb``,
                ``caption_bb``, ``caption``, ``name``, ``page`` and
                ``figure_type``; ``dpi`` is read only when ``page_height``
                is ``None``.
            target_page_size: ``(height, width)`` of the page the boxes are
                resized to; also stored on the returned figure.

        Returns:
            A new ``Figure`` whose boundaries were passed through
            ``BoxClass.resize_by_page``.
        """
        annotated_height = ann['page_height']
        annotated_width = ann['page_width']

        if annotated_height is None:
            # No page size in the annotation: derive one from the target
            # size, scaled by DEFAULT_INFERENCE_DPI over the annotation dpi.
            source_page_size = [
                side * DEFAULT_INFERENCE_DPI / ann['dpi']
                for side in target_page_size
            ]
        else:
            source_page_size = (annotated_height, annotated_width)

        region_box = BoxClass.from_tuple(ann['region_bb'])
        resized_region = region_box.resize_by_page(
            source_page_size, target_page_size
        )

        caption_box = BoxClass.from_tuple(ann['caption_bb'])
        resized_caption = caption_box.resize_by_page(
            source_page_size, target_page_size
        )

        return Figure(
            figure_boundary=resized_region,
            caption_boundary=resized_caption,
            caption_text=ann['caption'],
            name=ann['name'],
            page=ann['page'],
            figure_type=ann['figure_type'],
            page_width=target_page_size[1],
            page_height=target_page_size[0],
        )

    @staticmethod
    def from_pf_output(res: dict, target_dpi=DEFAULT_INFERENCE_DPI) -> 'Figure':
        """Build a ``Figure`` from one figure entry of a pdffigures result.

        Args:
            res: Figure dictionary. The keys read here are
                ``regionBoundary``, ``captionBoundary``, ``caption``,
                ``name``, ``page`` and ``figType``.
            target_dpi: Resolution the boxes are rescaled to.

        Returns:
            A new ``Figure`` whose boundaries were rescaled by
            ``target_dpi / 72``.
        """
        # NOTE(review): 72 is presumably the PDF points-per-inch unit of the
        # incoming coordinates -- confirm against the producer of ``res``.
        points_to_pixels = target_dpi / 72

        region_box = BoxClass.from_dict(res['regionBoundary'])
        scaled_region = region_box.rescale(points_to_pixels)

        caption_box = BoxClass.from_dict(res['captionBoundary'])
        scaled_caption = caption_box.rescale(points_to_pixels)

        return Figure(
            figure_boundary=scaled_region,
            caption_boundary=scaled_caption,
            caption_text=res['caption'],
            name=res['name'],
            page=res['page'],
            figure_type=res['figType'],
        )
class PdfDetectionResult(JsonSerializable):
    """Serializable record of the detection output for a single PDF.

    Holds the list of ``Figure`` objects plus optional raw data and an
    optional error string; the three optional traits accept ``None``.
    """

    pdf = traits.Unicode()
    figures = traits.List(
        traits.Instance(Figure)
    )
    dpi = traits.Int()

    # Nested list of boxes; may be None.
    # presumably one inner list per page -- TODO confirm with the producer
    raw_detected_boxes = traits.List(
        traits.List(traits.Instance(BoxClass)), allow_none=True
    )  # type: Optional[List[List[BoxClass]]]

    # Free-form dictionary; may be None.
    raw_pdffigures_output = traits.Dict(
        traits.Any(), allow_none=True
    )  # type: Optional[dict]

    # Error text; defaults to None.
    error = traits.Unicode(
        default_value=None, allow_none=True
    )  # type: Optional[str]
class CaptionOnly(JsonSerializable):
    """Serializable record for a caption that has no figure boundary.

    Declares the caption-related traits that ``Figure`` also declares
    (caption box, text, name, page, figure type, dpi) and nothing else.
    """

    # Box and text of the caption.
    caption_boundary = traits.Instance(BoxClass)
    caption_text = traits.Unicode()

    # Identification and placement.
    name = traits.Unicode()
    page = traits.Int()
    figure_type = traits.Unicode()
    dpi = traits.Int()
class PubmedMatchedFigure(config.JsonSerializable):
    """
    Data for a figure extracted from a PMC paper by matching captions
    against the nxml file that ships with the paper.
    """

    # Figure image held as a numpy array.
    fig_im = traits.Instance(np.ndarray)
    page_image_name = traits.Unicode()

    # Caption text and the caption it was matched to.
    caption = traits.Unicode()
    name = traits.Unicode()
    matched_caption = traits.Unicode()

    html_page = traits.Unicode()
    # NOTE(review): start_pos / end_pos look like offsets of the match
    # within html_page -- confirm with the code that fills them in.
    start_pos = traits.Int()
    end_pos = traits.Int()

    # Source document and page number.
    pdf = traits.Unicode()
    page_num = traits.Int()
class AuthorInfo(JsonSerializable):
    """Serializable record pairing an author's name with a bounding box."""

    # Box associated with the author entry.
    bounding_box = traits.Instance(BoxClass)
    # Author name as text.
    name = traits.Unicode()