def test_merge_bounding_boxes():
    """Boxes of all characters in a symbol are merged into one enclosing box."""
    target_symbol = symbol(characters=[0, 1])

    # Character 0 has two boxes; the second one extends the first 10 pixels downward.
    first_character_boxes = [
        PdfBoundingBox(0, 0, 10, 10, 0),
        PdfBoundingBox(0, 10, 10, 10, 0),
    ]
    # Character 1 extends the merged box 10 pixels to the right.
    second_character_boxes = [PdfBoundingBox(10, 0, 10, 10, 0)]
    # Character 2 is not one of the symbol's characters, so its box must be ignored.
    unrelated_character_boxes = [PdfBoundingBox(20, 0, 10, 10, 0)]

    character_locations = {
        character_id(0): first_character_boxes,
        character_id(1): second_character_boxes,
        character_id(2): unrelated_character_boxes,
    }

    merged = get_symbol_bounding_box(
        target_symbol, symbol_id(), character_locations)
    expected = PdfBoundingBox(0, 0, 20, 20, 0)
    assert merged == expected
Esempio n. 2
0
def get_symbol_bounding_box(
        symbol: Symbol, symbol_id: SymbolId,
        character_boxes: CharacterLocations) -> Optional[PdfBoundingBox]:
    """
    Compute the smallest box that encloses all of a symbol's character boxes.

    Each index in 'symbol.characters' is combined with the TeX path and equation
    index of 'symbol_id' to form a 'CharacterId', which is looked up in
    'character_boxes'. Characters with no entry contribute no boxes.

    Returns None if no boxes were found for any of the symbol's characters.
    If the boxes span several pages, a warning is logged and only the boxes on
    the page of the first box found are merged.
    """
    found_boxes = [
        found
        for character_index in symbol.characters
        for found in character_boxes.get(
            CharacterId(symbol_id.tex_path, symbol_id.equation_index,
                        character_index),
            [],
        )
    ]

    if not found_boxes:
        return None

    # Boxes for a symbol should be on only one page.
    pages = {found.page for found in found_boxes}
    if len(pages) > 1:
        logging.warning(
            "Boxes found on more than one page for symbol %s. "
            "Only the boxes for one page will be considered.",
            symbol,
        )

    # The page of the first box decides which boxes take part in the merge.
    page = found_boxes[0].page
    same_page = [found for found in found_boxes if found.page == page]

    left = min(found.left for found in same_page)
    top = min(found.top for found in same_page)
    right = max(found.left + found.width for found in same_page)
    bottom = max(found.top + found.height for found in same_page)

    return PdfBoundingBox(left, top, right - left, bottom - top, page)
Esempio n. 3
0
def _to_pdf_coordinates(
    bounding_box: Rectangle,
    image_width: int,
    image_height: int,
    pdf_page_width: float,
    pdf_page_height: float,
    page: int,
) -> PdfBoundingBox:
    """
    Convert a rectangle in image coordinates into a bounding box in PDF coordinates.

    Horizontal positions are scaled by 'pdf_page_width / image_width'. Vertical
    positions are scaled by 'pdf_page_height / image_height' and then flipped,
    because image coordinates are measured from the top of the image while the
    PDF coordinates produced here are measured from the bottom of the page.
    """
    image_left = bounding_box.left
    image_top = bounding_box.top
    image_right = bounding_box.left + bounding_box.width
    image_bottom = bounding_box.top + bounding_box.height

    horizontal_scale = pdf_page_width / image_width
    pdf_left = image_left * horizontal_scale
    pdf_right = image_right * horizontal_scale

    # Flip the y-axis: subtract the scaled distance from the top from the page height.
    vertical_scale = pdf_page_height / image_height
    pdf_top = pdf_page_height - (image_top * vertical_scale)
    pdf_bottom = pdf_page_height - (image_bottom * vertical_scale)

    pdf_width = pdf_right - pdf_left
    # After the flip, the top edge has the larger y value, so top - bottom is the height.
    pdf_height = pdf_top - pdf_bottom
    return PdfBoundingBox(
        left=pdf_left,
        top=pdf_top,
        width=pdf_width,
        height=pdf_height,
        page=page,
    )
def common_load_bounding_boxes(
    hue_locations_dir_path: str, ) -> Dict[str, List[PdfBoundingBoxAndHue]]:
    """
    Read 'hue_locations.csv' from a directory and group its boxes by PDF path.

    Each CSV row supplies the PDF path (column 0), the hue (column 2), and the
    page, left, top, width and height of a box (columns 3-7). Returns a dict
    mapping each PDF path to its (hue, box) entries in file order. If the CSV
    file does not exist, a warning is logged and an empty dict is returned.
    """
    csv_path = os.path.join(hue_locations_dir_path, "hue_locations.csv")

    if not os.path.exists(csv_path):
        logging.warning(
            "Could not find any bounding box data in directory %s",
            hue_locations_dir_path,
        )
        return {}

    boxes_by_pdf: Dict[str, List[PdfBoundingBoxAndHue]] = {}
    with open(csv_path) as csv_file:
        for columns in csv.reader(csv_file):
            pdf_path = columns[0]
            hue = float(columns[2])
            location = PdfBoundingBox(
                page=int(columns[3]),
                left=float(columns[4]),
                top=float(columns[5]),
                width=float(columns[6]),
                height=float(columns[7]),
            )
            entry = PdfBoundingBoxAndHue(hue, location)
            # Start a new list the first time a PDF path is seen.
            boxes_by_pdf.setdefault(pdf_path, []).append(entry)
    return boxes_by_pdf
    def load(self) -> Iterator[SymbolData]:
        """
        Yield the symbol data for each arXiv ID returned by get_arxiv_ids(SOURCES_DIR).

        A paper is skipped when its S2 ID or its symbols cannot be loaded, or when
        either 'symbol_locations.csv' or 'matches.csv' is missing (the missing-file
        cases log a warning). Otherwise the yielded item bundles the arXiv ID, the
        S2 ID, the symbols, a box per symbol ID, and the ranked matches per MathML.
        """
        for arxiv_id in get_arxiv_ids(SOURCES_DIR):

            s2_id = get_s2_id(arxiv_id)
            if s2_id is None:
                continue

            symbols_with_ids = load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            # Load one bounding box per symbol; a later row for the same ID wins.
            locations_csv_path = os.path.join(
                directories.symbol_locations(arxiv_id), "symbol_locations.csv"
            )
            if not os.path.exists(locations_csv_path):
                logging.warning(
                    "Could not find bounding boxes information for %s. Skipping",
                    arxiv_id,
                )
                continue
            symbol_boxes: Dict[SymbolId, PdfBoundingBox] = {}
            with open(locations_csv_path) as locations_csv:
                for columns in csv.reader(locations_csv):
                    key = SymbolId(
                        tex_path=columns[0],
                        equation_index=int(columns[1]),
                        symbol_index=int(columns[2]),
                    )
                    symbol_boxes[key] = PdfBoundingBox(
                        page=int(columns[3]),
                        left=float(columns[4]),
                        top=float(columns[5]),
                        width=float(columns[6]),
                        height=float(columns[7]),
                    )

            # Load the matches, grouped by the MathML in the first column.
            matches_csv_path = os.path.join(
                directories.symbol_matches(arxiv_id), "matches.csv"
            )
            if not os.path.exists(matches_csv_path):
                logging.warning(
                    "Could not find symbol matches information for %s. Skipping",
                    arxiv_id,
                )
                continue
            symbol_matches: Matches = {}
            with open(matches_csv_path) as matches_csv:
                for columns in csv.reader(matches_csv):
                    mathml = columns[0]
                    matching_mathml = columns[1]
                    rank = int(columns[2])
                    symbol_matches.setdefault(mathml, []).append(
                        Match(matching_mathml, rank)
                    )

            yield SymbolData(
                arxiv_id, s2_id, symbols_with_ids, symbol_boxes, symbol_matches
            )
    def load(self) -> Iterator[LocationTask]:
        """
        Yield one location task per symbol of each paper that has token locations.

        For every arXiv ID returned for the equation-token hue locations directory,
        the paper's symbol locations output directory is cleaned first. The token
        boxes are then read from 'hue_locations.csv' and grouped by character ID
        (built from the last three columns of each row). Papers with no such file
        (a warning is logged) or whose symbols cannot be loaded are skipped. All
        tasks of a paper share the same dictionary of character locations.
        """
        for arxiv_id in get_arxiv_ids(
                directories.HUE_LOCATIONS_FOR_EQUATION_TOKENS_DIR):

            # The output directory is cleaned before any of the skip checks below.
            clean_directory(directories.symbol_locations(arxiv_id))

            locations_csv_path = os.path.join(
                directories.hue_locations_for_equation_tokens(arxiv_id),
                "hue_locations.csv",
            )
            if not os.path.exists(locations_csv_path):
                logging.warning(
                    "Could not find bounding boxes information for %s. Skipping",
                    arxiv_id,
                )
                continue

            boxes_by_character: Dict[CharacterId, List[PdfBoundingBox]] = {}
            with open(locations_csv_path) as locations_csv:
                for columns in csv.reader(locations_csv):
                    # The character is identified by the final three columns of the row.
                    tex_path = columns[-3]
                    equation_index = int(columns[-2])
                    character_index = int(columns[-1])
                    key = CharacterId(tex_path, equation_index,
                                      character_index)
                    location = PdfBoundingBox(
                        page=int(columns[3]),
                        left=float(columns[4]),
                        top=float(columns[5]),
                        width=float(columns[6]),
                        height=float(columns[7]),
                    )
                    boxes_by_character.setdefault(key, []).append(location)

            symbols_with_ids = load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            for entry in symbols_with_ids:
                yield LocationTask(
                    arxiv_id=arxiv_id,
                    character_locations=boxes_by_character,
                    symbol_with_id=entry,
                )
def test_get_character_bounding_box():
    """A symbol made of a single character gets exactly that character's box."""
    single_character_symbol = symbol(characters=[0])
    character_locations = {
        character_id(0): [PdfBoundingBox(0, 0, 10, 10, 0)],
    }
    result = get_symbol_bounding_box(
        single_character_symbol, symbol_id(), character_locations)
    expected = PdfBoundingBox(0, 0, 10, 10, 0)
    assert result == expected
    def load(self) -> Iterator[CitationData]:
        """
        Yield the citation data for each arXiv ID returned by get_arxiv_ids(SOURCES_DIR).

        For every paper, the following are gathered:

        * bounding boxes from the citations 'hue_locations.csv', grouped by
          (hue, iteration);
        * citation keys from each iteration's 'citation_hues.csv', keyed by
          (hue, iteration);
        * the S2 ID each citation key resolves to, from 'resolutions.csv';
        * the contents of the paper's own 's2_id' file;
        * reference metadata from 'references.csv', keyed by S2 ID.

        A paper is skipped (with a warning) when its hue locations, resolutions,
        S2 ID file or references file is missing. A missing 'citation_hues.csv'
        only skips that one iteration.
        """
        for arxiv_id in get_arxiv_ids(SOURCES_DIR):

            boxes_by_hue_iteration: Dict[HueIteration,
                                         List[PdfBoundingBox]] = {}
            bounding_boxes_path = os.path.join(
                directories.hue_locations_for_citations(arxiv_id),
                "hue_locations.csv")
            if not os.path.exists(bounding_boxes_path):
                logging.warning(
                    "Could not find bounding boxes information for %s. Skipping",
                    arxiv_id,
                )
                continue
            with open(bounding_boxes_path) as bounding_boxes_file:
                reader = csv.reader(bounding_boxes_file)
                for row in reader:
                    box_iteration = row[1]
                    hue = float(row[2])
                    box = PdfBoundingBox(
                        page=int(row[3]),
                        left=float(row[4]),
                        top=float(row[5]),
                        width=float(row[6]),
                        height=float(row[7]),
                    )
                    hue_iteration = HueIteration(hue, box_iteration)
                    # Group on the same key the dictionary is indexed by. Testing the
                    # bare hue for membership would reset the list on every row and
                    # keep only the last box for each (hue, iteration) pair.
                    boxes_by_hue_iteration.setdefault(hue_iteration,
                                                      []).append(box)

            citations_by_hue_iteration: Dict[HueIteration, CitationKeys] = {}
            for iteration in get_iteration_names(
                    directories.SOURCES_WITH_COLORIZED_CITATIONS_DIR,
                    arxiv_id):
                citation_hues_path = os.path.join(
                    get_data_subdirectory_for_iteration(
                        directories.SOURCES_WITH_COLORIZED_CITATIONS_DIR,
                        arxiv_id,
                        iteration,
                    ),
                    "citation_hues.csv",
                )
                if not os.path.exists(citation_hues_path):
                    logging.warning(
                        "Could not find citation hue colors for %s iteration %s. Skipping",
                        arxiv_id,
                        iteration,
                    )
                    continue
                with open(citation_hues_path) as citation_hues_file:
                    reader = csv.reader(citation_hues_file)
                    for row in reader:
                        # The citation keys are stored as a JSON array in column 3.
                        citation_keys = cast(Tuple[str],
                                             tuple(json.loads(row[3])))
                        hue = float(row[2])
                        # The key uses the iteration recorded in the row itself; a
                        # separate name avoids rebinding the outer loop variable.
                        row_iteration = row[1]
                        hue_iteration = HueIteration(hue, row_iteration)
                        citations_by_hue_iteration[
                            hue_iteration] = citation_keys

            key_s2_ids: Dict[CitationKey, S2Id] = {}
            key_resolutions_path = os.path.join(
                directories.bibitem_resolutions(arxiv_id), "resolutions.csv")
            if not os.path.exists(key_resolutions_path):
                logging.warning(
                    "Could not find citation resolutions for %s. Skipping",
                    arxiv_id)
                continue
            with open(key_resolutions_path) as key_resolutions_file:
                reader = csv.reader(key_resolutions_file)
                for row in reader:
                    key = row[0]
                    resolved_s2_id = row[1]
                    key_s2_ids[key] = resolved_s2_id

            s2_id_path = os.path.join(directories.s2_metadata(arxiv_id),
                                      "s2_id")
            if not os.path.exists(s2_id_path):
                logging.warning("Could not find S2 ID file for %s. Skipping",
                                arxiv_id)
                continue
            with open(s2_id_path) as s2_id_file:
                # The file contents are used verbatim as the paper's S2 ID.
                s2_id = s2_id_file.read()

            s2_data: Dict[S2Id, Reference] = {}
            s2_metadata_path = os.path.join(directories.s2_metadata(arxiv_id),
                                            "references.csv")
            if not os.path.exists(s2_metadata_path):
                logging.warning(
                    "Could not find S2 metadata file for citations for %s. Skipping",
                    arxiv_id,
                )
                continue
            with open(s2_metadata_path) as s2_metadata_file:
                reader = csv.reader(s2_metadata_file)
                for row in reader:
                    # Empty CSV fields mean "unknown". Compare by value: an identity
                    # test against a string literal is implementation-dependent.
                    s2_data[row[0]] = Reference(
                        s2Id=row[0],
                        arxivId=row[1] if row[1] != "" else None,
                        doi=row[2] if row[2] != "" else None,
                        title=row[3],
                        authors=[
                            Author(id=None, name=nm)
                            for nm in row[4].split(",")
                        ],
                        venue=row[5],
                        year=int(row[6]) if row[6] != "" else None,
                    )

            yield CitationData(
                arxiv_id,
                s2_id,
                boxes_by_hue_iteration,
                citations_by_hue_iteration,
                key_s2_ids,
                s2_data,
            )