Esempio n. 1
0
class TitleAndAuthorInfo(JsonSerializable):
    """Title and author annotation record for one PDF, declared as traits.

    NOTE(review): the per-field meanings below are inferred from the field
    names and trait types only; confirm against the code that populates them.
    """
    # Presumably the path or identifier of the source PDF -- TODO confirm.
    pdf = traits.Unicode()
    # Presumably the SHA-1 hex digest of the PDF contents -- TODO confirm.
    pdf_sha1 = traits.Unicode()
    # Presumably the path of the rendered page image the boxes refer to
    # -- TODO confirm.
    image_path = traits.Unicode()
    # Box around the title; coordinate space is not shown here.
    title_bounding_box = traits.Instance(BoxClass)
    # Text of the title.
    title_text = traits.Unicode()
    # One AuthorInfo entry per author.
    authors = traits.List(traits.Instance(AuthorInfo))
Esempio n. 2
0
class CaptionOnly(JsonSerializable):
    """Caption record that carries a caption box and text but no figure box.

    NOTE(review): the per-field meanings below are inferred from the field
    names and trait types only; confirm against the code that populates them.
    """
    # Box around the caption; coordinate space is not shown here.
    caption_boundary = traits.Instance(BoxClass)
    # Text of the caption.
    caption_text = traits.Unicode()
    # Presumably the figure/table label (e.g. its number) -- TODO confirm.
    name = traits.Unicode()
    # Page number; whether it is 0- or 1-based is not shown here.
    page = traits.Int()
    # Presumably a category such as figure vs. table -- TODO confirm.
    figure_type = traits.Unicode()
    # Presumably the resolution the box coordinates are expressed in
    # -- TODO confirm.
    dpi = traits.Int()
Esempio n. 3
0
class Figure(JsonSerializable):
    """A figure record: figure and caption boxes, caption text and page info.

    Instances are usually built through one of the alternate constructors,
    :meth:`from_pf_ann` (pdffigures-format annotation) or
    :meth:`from_pf_output` (pdffigures output entry).

    NOTE(review): neither alternate constructor sets ``dpi``; it is left at
    its trait default. Confirm whether downstream consumers rely on it.
    """
    # Box around the figure region.
    figure_boundary = traits.Instance(BoxClass)
    # Box around the caption.
    caption_boundary = traits.Instance(BoxClass)
    caption_text = traits.Unicode()
    name = traits.Unicode()
    page = traits.Int()
    figure_type = traits.Unicode()
    dpi = traits.Int()
    page_width = traits.Int()
    page_height = traits.Int()
    # URI to cropped image of the figure
    uri = traits.Unicode(
        default_value=None, allow_none=True)

    def page_size(self) -> Tuple[int, int]:
        """Return the page dimensions as ``(page_height, page_width)``."""
        return self.page_height, self.page_width

    @classmethod
    def from_pf_ann(cls, ann: dict, target_page_size: Tuple[int, int]) -> 'Figure':
        """Convert an annotation in the pdffigures format.

        Args:
            ann: Annotation dict. The keys read are ``'page_height'``,
                ``'page_width'``, ``'region_bb'``, ``'caption_bb'``,
                ``'caption'``, ``'name'``, ``'page'`` and ``'figure_type'``;
                ``'dpi'`` is read only when the page size is missing.
            target_page_size: ``(height, width)`` of the page the boxes
                should be resized to. These values are also stored as the
                new figure's ``page_height`` and ``page_width``.

        Returns:
            A new instance whose boxes are resized from the annotation's page
            size to ``target_page_size``.

        Raises:
            KeyError: If a required key is absent from ``ann``.
        """
        cur_page_size = ann['page_height'], ann['page_width']
        # The annotation may not record its page size. Fall back when EITHER
        # dimension is missing, so that a half-populated annotation does not
        # leak ``None`` into the resize arithmetic. The fallback estimates the
        # annotation's page size from the target size and the ratio of
        # DEFAULT_INFERENCE_DPI to the annotation's dpi.
        if cur_page_size[0] is None or cur_page_size[1] is None:
            cur_page_size = [
                d * DEFAULT_INFERENCE_DPI / ann['dpi'] for d in target_page_size
            ]

        def resized(bb):
            # Both boxes go through exactly the same page-to-page resize.
            return BoxClass.from_tuple(bb).resize_by_page(
                cur_page_size, target_page_size
            )

        return cls(
            figure_boundary=resized(ann['region_bb']),
            caption_boundary=resized(ann['caption_bb']),
            caption_text=ann['caption'],
            name=ann['name'],
            page=ann['page'],
            figure_type=ann['figure_type'],
            page_width=target_page_size[1],
            page_height=target_page_size[0],
        )

    @classmethod
    def from_pf_output(cls, res: dict, target_dpi=DEFAULT_INFERENCE_DPI) -> 'Figure':
        """Convert a pdffigures output figure to a Figure object.

        Args:
            res: One figure entry of the pdffigures output. The keys read are
                ``'regionBoundary'``, ``'captionBoundary'``, ``'caption'``,
                ``'name'``, ``'page'`` and ``'figType'``.
            target_dpi: Resolution the returned boxes should be expressed in.

        Returns:
            A new instance whose boxes are rescaled by ``target_dpi / 72``.
            Page width and height are not set by this constructor.

        Raises:
            KeyError: If a required key is absent from ``res``.
        """
        # NOTE(review): the divisor 72 presumably reflects pdffigures
        # reporting boxes at 72 DPI (PDF points) -- confirm.
        scale_factor = target_dpi / 72
        return cls(
            figure_boundary=BoxClass.from_dict(
                res['regionBoundary']
            ).rescale(scale_factor),
            caption_boundary=BoxClass.from_dict(
                res['captionBoundary']
            ).rescale(scale_factor),
            caption_text=res['caption'],
            name=res['name'],
            page=res['page'],
            figure_type=res['figType'],
        )
Esempio n. 4
0
class PdfDetectionResult(JsonSerializable):
    """Detection output for a single PDF, declared as traits.

    NOTE(review): the per-field meanings below are inferred from the field
    names and trait types only; confirm against the code that populates them.
    """
    # Presumably the path or identifier of the processed PDF -- TODO confirm.
    pdf = traits.Unicode()
    # Figures found in the PDF.
    figures = traits.List(traits.Instance(Figure))
    # Presumably the resolution the figure boxes are expressed in
    # -- TODO confirm.
    dpi = traits.Int()
    # Nested list of boxes; may be None. The outer grouping is presumably
    # per page -- TODO confirm.
    raw_detected_boxes = traits.List(
        traits.List(traits.Instance(BoxClass)), allow_none=True
    )  # type: Optional[List[List[BoxClass]]]
    # Presumably the unprocessed pdffigures output; may be None.
    raw_pdffigures_output = traits.Dict(
        traits.Any(), allow_none=True
    )  # type: Optional[dict]
    # Error message; defaults to None. Presumably set only when processing
    # failed -- TODO confirm.
    error = traits.Unicode(
        default_value=None, allow_none=True
    )  # type: Optional[str]
Esempio n. 5
0
class PubmedMatchedFigure(config.JsonSerializable):
    """
    Contains data on a figure extracted from a PMC paper via caption matching with the included nxml file.

    NOTE(review): the per-field meanings below are inferred from the field
    names and trait types only; confirm against the code that populates them.
    """
    # Figure image held as a numpy array; shape and dtype are not shown here.
    fig_im = traits.Instance(np.ndarray)
    # Presumably the file name of the rendered page image -- TODO confirm.
    page_image_name = traits.Unicode()
    # Caption text.
    caption = traits.Unicode()
    # Presumably the figure label -- TODO confirm.
    name = traits.Unicode()
    # Presumably the caption text that was matched against the nxml file
    # -- TODO confirm.
    matched_caption = traits.Unicode()
    # Presumably page text/markup in which the caption was located
    # -- TODO confirm.
    html_page = traits.Unicode()
    # Presumably the start and end offsets of the matched caption; which text
    # they index into is not shown here -- TODO confirm.
    start_pos = traits.Int()
    end_pos = traits.Int()
    # Presumably the path or identifier of the source PDF -- TODO confirm.
    pdf = traits.Unicode()
    # Page number; whether it is 0- or 1-based is not shown here.
    page_num = traits.Int()
Esempio n. 6
0
class AuthorInfo(JsonSerializable):
    """A single author entry: a bounding box plus the author's name."""
    # Box associated with the author; presumably where the name appears on
    # the page -- TODO confirm. Coordinate space is not shown here.
    bounding_box = traits.Instance(BoxClass)
    # Author name.
    name = traits.Unicode()