def test_merge_bounding_boxes():
    """Boxes for all of a symbol's characters merge into one covering box."""
    sym = symbol(characters=[0, 1])
    locations = {
        character_id(0): [
            PdfBoundingBox(0, 0, 10, 10, 0),
            # Second box stretches the merged region 10 pixels downward.
            PdfBoundingBox(0, 10, 10, 10, 0),
        ],
        # This character's box stretches the merged region 10 pixels rightward.
        character_id(1): [PdfBoundingBox(10, 0, 10, 10, 0)],
        # Character 2 is not part of the symbol; its box must be ignored.
        character_id(2): [PdfBoundingBox(20, 0, 10, 10, 0)],
    }
    merged = get_symbol_bounding_box(sym, symbol_id(), locations)
    assert merged == PdfBoundingBox(0, 0, 20, 20, 0)
def get_symbol_bounding_box(
    symbol: Symbol, symbol_id: SymbolId, character_boxes: CharacterLocations
) -> Optional[PdfBoundingBox]:
    """Compute a single box covering all located characters of 'symbol'.

    Returns None when none of the symbol's characters has a known location.
    If boxes appear on multiple pages, only boxes on the page of the first
    box found are merged.
    """
    boxes: List[PdfBoundingBox] = []
    for index in symbol.characters:
        cid = CharacterId(symbol_id.tex_path, symbol_id.equation_index, index)
        boxes.extend(character_boxes.get(cid, []))
    if not boxes:
        return None

    # Boxes for a symbol should be on only one page; warn when they are not.
    if len({box.page for box in boxes}) > 1:
        logging.warning(
            "Boxes found on more than one page for symbol %s. Only the boxes for one page will be considered.",
            symbol,
        )
    page = boxes[0].page
    same_page = [b for b in boxes if b.page == page]

    left = min(b.left for b in same_page)
    top = min(b.top for b in same_page)
    right = max(b.left + b.width for b in same_page)
    bottom = max(b.top + b.height for b in same_page)
    return PdfBoundingBox(left, top, right - left, bottom - top, page)
def _to_pdf_coordinates(
    bounding_box: Rectangle,
    image_width: int,
    image_height: int,
    pdf_page_width: float,
    pdf_page_height: float,
    page: int,
) -> PdfBoundingBox:
    """Convert a box from raster-image coordinates to PDF page coordinates.

    PDF y-coordinates are measured upward from the page bottom, while image
    y-coordinates are measured downward from the image top, so the vertical
    axis is flipped during conversion.
    """
    x_scale = pdf_page_width / image_width
    y_scale = pdf_page_height / image_height

    pdf_left = bounding_box.left * x_scale
    pdf_right = (bounding_box.left + bounding_box.width) * x_scale
    # Flip the vertical axis: subtract scaled image-y from the page height.
    pdf_top = pdf_page_height - bounding_box.top * y_scale
    pdf_bottom = pdf_page_height - (bounding_box.top + bounding_box.height) * y_scale
    return PdfBoundingBox(
        left=pdf_left,
        top=pdf_top,
        width=pdf_right - pdf_left,
        height=pdf_top - pdf_bottom,
        page=page,
    )
def common_load_bounding_boxes(
    hue_locations_dir_path: str,
) -> Dict[str, List[PdfBoundingBoxAndHue]]:
    """Load hue-annotated bounding boxes from 'hue_locations.csv' in a directory.

    Returns a map from PDF path (column 0 of the CSV) to the list of boxes
    (with their hues) found for that PDF, or an empty map when the CSV file
    does not exist.
    """
    box_data_path = os.path.join(hue_locations_dir_path, "hue_locations.csv")
    if not os.path.exists(box_data_path):
        logging.warning(
            "Could not find any bounding box data in directory %s",
            hue_locations_dir_path,
        )
        return {}

    boxes: Dict[str, List[PdfBoundingBoxAndHue]] = {}
    with open(box_data_path) as box_data_file:
        reader = csv.reader(box_data_file)
        for row in reader:
            pdf_path = row[0]
            hue = float(row[2])
            box = PdfBoundingBox(
                page=int(row[3]),
                left=float(row[4]),
                top=float(row[5]),
                width=float(row[6]),
                height=float(row[7]),
            )
            # setdefault replaces the unidiomatic 'if not pdf_path in boxes'
            # membership test followed by a separate assignment.
            boxes.setdefault(pdf_path, []).append(PdfBoundingBoxAndHue(hue, box))
    return boxes
def load(self) -> Iterator[SymbolData]:
    """Yield symbol data (symbols, box locations, MathML matches) per paper.

    Papers are skipped, with a warning, when any required input (S2 ID,
    extracted symbols, location CSV, or match CSV) is missing.
    """
    for arxiv_id in get_arxiv_ids(SOURCES_DIR):
        s2_id = get_s2_id(arxiv_id)
        if s2_id is None:
            continue

        symbols_with_ids = load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue

        boxes_path = os.path.join(
            directories.symbol_locations(arxiv_id), "symbol_locations.csv"
        )
        if not os.path.exists(boxes_path):
            logging.warning(
                "Could not find bounding boxes information for %s. Skipping",
                arxiv_id,
            )
            continue
        boxes: Dict[SymbolId, PdfBoundingBox] = {}
        with open(boxes_path) as boxes_file:
            for row in csv.reader(boxes_file):
                sid = SymbolId(
                    tex_path=row[0],
                    equation_index=int(row[1]),
                    symbol_index=int(row[2]),
                )
                boxes[sid] = PdfBoundingBox(
                    page=int(row[3]),
                    left=float(row[4]),
                    top=float(row[5]),
                    width=float(row[6]),
                    height=float(row[7]),
                )

        matches_path = os.path.join(
            directories.symbol_matches(arxiv_id), "matches.csv"
        )
        if not os.path.exists(matches_path):
            logging.warning(
                "Could not find symbol matches information for %s. Skipping",
                arxiv_id,
            )
            continue
        matches: Matches = {}
        with open(matches_path) as matches_file:
            for row in csv.reader(matches_file):
                # Group matches by the symbol's MathML representation.
                matches.setdefault(row[0], []).append(Match(row[1], int(row[2])))

        yield SymbolData(arxiv_id, s2_id, symbols_with_ids, boxes, matches)
def load(self) -> Iterator[LocationTask]:
    """Yield one location task per symbol of every paper with token locations.

    Also clears each paper's symbol-locations output directory before
    yielding its tasks. Papers missing the token-location CSV or extracted
    symbols are skipped (with a warning for the former).
    """
    for arxiv_id in get_arxiv_ids(directories.HUE_LOCATIONS_FOR_EQUATION_TOKENS_DIR):
        clean_directory(directories.symbol_locations(arxiv_id))

        token_locations_path = os.path.join(
            directories.hue_locations_for_equation_tokens(arxiv_id),
            "hue_locations.csv",
        )
        if not os.path.exists(token_locations_path):
            logging.warning(
                "Could not find bounding boxes information for %s. Skipping",
                arxiv_id,
            )
            continue

        token_locations: Dict[CharacterId, List[PdfBoundingBox]] = {}
        with open(token_locations_path) as token_locations_file:
            for row in csv.reader(token_locations_file):
                # The character's identity is stored in the last three
                # columns: TeX path, equation index, character index.
                cid = CharacterId(row[-3], int(row[-2]), int(row[-1]))
                box = PdfBoundingBox(
                    page=int(row[3]),
                    left=float(row[4]),
                    top=float(row[5]),
                    width=float(row[6]),
                    height=float(row[7]),
                )
                token_locations.setdefault(cid, []).append(box)

        symbols_with_ids = load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue
        for symbol_with_id in symbols_with_ids:
            yield LocationTask(
                arxiv_id=arxiv_id,
                character_locations=token_locations,
                symbol_with_id=symbol_with_id,
            )
def test_get_character_bounding_box():
    """A single-character symbol takes that character's bounding box as-is."""
    sym = symbol(characters=[0])
    locations = {character_id(0): [PdfBoundingBox(0, 0, 10, 10, 0)]}
    result = get_symbol_bounding_box(sym, symbol_id(), locations)
    assert result == PdfBoundingBox(0, 0, 10, 10, 0)
def load(self) -> Iterator[CitationData]:
    """Yield citation data (boxes, citation keys, S2 metadata) per paper.

    Papers are skipped, with a warning, when any of the required inputs
    (bounding boxes, citation hues, key resolutions, S2 ID, S2 references)
    is missing.
    """
    for arxiv_id in get_arxiv_ids(SOURCES_DIR):
        boxes_by_hue_iteration: Dict[HueIteration, List[PdfBoundingBox]] = {}
        bounding_boxes_path = os.path.join(
            directories.hue_locations_for_citations(arxiv_id), "hue_locations.csv"
        )
        if not os.path.exists(bounding_boxes_path):
            logging.warning(
                "Could not find bounding boxes information for %s. Skipping",
                arxiv_id,
            )
            continue
        with open(bounding_boxes_path) as bounding_boxes_file:
            reader = csv.reader(bounding_boxes_file)
            for row in reader:
                iteration = row[1]
                hue = float(row[2])
                box = PdfBoundingBox(
                    page=int(row[3]),
                    left=float(row[4]),
                    top=float(row[5]),
                    width=float(row[6]),
                    height=float(row[7]),
                )
                hue_iteration = HueIteration(hue, iteration)
                # Fix: membership must be tested with the (hue, iteration)
                # key. The old code tested 'hue not in ...' against a dict
                # keyed by HueIteration, which was always true, so the list
                # was reset on every row and only the last box survived.
                if hue_iteration not in boxes_by_hue_iteration:
                    boxes_by_hue_iteration[hue_iteration] = []
                boxes_by_hue_iteration[hue_iteration].append(box)

        citations_by_hue_iteration: Dict[HueIteration, CitationKeys] = {}
        for iteration in get_iteration_names(
            directories.SOURCES_WITH_COLORIZED_CITATIONS_DIR, arxiv_id
        ):
            citation_hues_path = os.path.join(
                get_data_subdirectory_for_iteration(
                    directories.SOURCES_WITH_COLORIZED_CITATIONS_DIR,
                    arxiv_id,
                    iteration,
                ),
                "citation_hues.csv",
            )
            if not os.path.exists(citation_hues_path):
                logging.warning(
                    "Could not find citation hue colors for %s iteration %s. Skipping",
                    arxiv_id,
                    iteration,
                )
                continue
            with open(citation_hues_path) as citation_hues_file:
                reader = csv.reader(citation_hues_file)
                for row in reader:
                    citation_keys = cast(Tuple[str], tuple(json.loads(row[3])))
                    hue = float(row[2])
                    iteration = row[1]
                    hue_iteration = HueIteration(hue, iteration)
                    citations_by_hue_iteration[hue_iteration] = citation_keys

        key_s2_ids: Dict[CitationKey, S2Id] = {}
        key_resolutions_path = os.path.join(
            directories.bibitem_resolutions(arxiv_id), "resolutions.csv"
        )
        if not os.path.exists(key_resolutions_path):
            logging.warning(
                "Could not find citation resolutions for %s. Skipping", arxiv_id
            )
            continue
        with open(key_resolutions_path) as key_resolutions_file:
            reader = csv.reader(key_resolutions_file)
            for row in reader:
                key = row[0]
                s2_id = row[1]
                key_s2_ids[key] = s2_id

        s2_id_path = os.path.join(directories.s2_metadata(arxiv_id), "s2_id")
        if not os.path.exists(s2_id_path):
            logging.warning("Could not find S2 ID file for %s. Skipping", arxiv_id)
            continue
        with open(s2_id_path) as s2_id_file:
            s2_id = s2_id_file.read()

        s2_data: Dict[S2Id, Reference] = {}
        s2_metadata_path = os.path.join(
            directories.s2_metadata(arxiv_id), "references.csv"
        )
        if not os.path.exists(s2_metadata_path):
            logging.warning(
                "Could not find S2 metadata file for citations for %s. Skipping",
                arxiv_id,
            )
            continue
        with open(s2_metadata_path) as s2_metadata_file:
            reader = csv.reader(s2_metadata_file)
            for row in reader:
                # Fix: compare strings with '!=', not 'is not'. Identity
                # comparison against a literal relies on CPython interning
                # and raises a SyntaxWarning.
                s2_data[row[0]] = Reference(
                    s2Id=row[0],
                    arxivId=row[1] if row[1] != "" else None,
                    doi=row[2] if row[2] != "" else None,
                    title=row[3],
                    authors=[
                        Author(id=None, name=nm) for nm in row[4].split(",")
                    ],
                    venue=row[5],
                    year=int(row[6]) if row[6] != "" else None,
                )

        yield CitationData(
            arxiv_id,
            s2_id,
            boxes_by_hue_iteration,
            citations_by_hue_iteration,
            key_s2_ids,
            s2_data,
        )