# These loaders assume that the standard-library modules (csv, logging, os),
# third-party dependencies (cv2, PyPDF2's PdfFileReader), typing names
# (Dict, Iterator, List, Optional), and the pipeline's own helpers
# (directories, clean_directory, get_arxiv_ids, read_file_tolerant, and the
# task dataclasses) are imported in each function's defining module.
def load(self) -> Iterator[MatchTask]:
    for arxiv_id in os.listdir(BIBITEMS_DIR):
        clean_directory(directories.bibitem_resolutions(arxiv_id))
        bibitems_dir = directories.bibitems(arxiv_id)
        metadata_dir = directories.s2_metadata(arxiv_id)

        references = []
        references_path = os.path.join(metadata_dir, "references.csv")
        with open(references_path) as references_file:
            reader = csv.reader(references_file)
            for row in reader:
                references.append(
                    Reference(
                        s2Id=row[0],
                        arxivId=row[1],
                        doi=row[2],
                        title=row[3],
                        authors=[
                            Author(id=None, name=name)
                            for name in row[4].split(", ")
                        ],
                        venue=row[5],
                        # Treat empty or whitespace-only year fields as missing.
                        year=int(row[6]) if row[6].strip() else None,
                    )
                )

        bibitems = []
        bibitems_path = os.path.join(bibitems_dir, "bibitems.csv")
        with open(bibitems_path) as bibitems_file:
            reader = csv.reader(bibitems_file)
            for row in reader:
                bibitems.append(Bibitem(key=row[0], text=row[1]))

        yield MatchTask(arxiv_id, bibitems, references)

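# A minimal sketch of a stricter year parser, in case non-numeric values also
# appear in the "year" column. The helper name is hypothetical and not part of
# the original pipeline.
def parse_optional_int(value: str) -> Optional[int]:
    # Return None for empty, whitespace-only, or non-numeric fields.
    stripped = value.strip()
    return int(stripped) if stripped.isdigit() else None
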
def load(self) -> Iterator[TexAndTokens]:
    for arxiv_id in get_arxiv_ids(directories.SOURCES_DIR):
        output_root = get_data_subdirectory_for_arxiv_id(
            directories.SOURCES_WITH_COLORIZED_EQUATION_TOKENS_DIR, arxiv_id
        )
        clean_directory(output_root)

        tokens_path = os.path.join(directories.symbols(arxiv_id), "tokens.csv")
        if not os.path.exists(tokens_path):
            logging.info("No equation token data found for paper %s. Skipping.", arxiv_id)
            continue

        # Load token location information.
        tokens = load_tokens(arxiv_id)
        if tokens is None:
            continue
        tex_paths = {token.tex_path for token in tokens}

        # Load original sources for TeX files that need to be colorized.
        contents_by_file = {}
        for tex_path in tex_paths:
            absolute_tex_path = os.path.join(directories.sources(arxiv_id), tex_path)
            contents = read_file_tolerant(absolute_tex_path)
            if contents is not None:
                contents_by_file[tex_path] = contents

        yield TexAndTokens(arxiv_id, contents_by_file, tokens)

def load(self) -> Iterator[FileContents]:
    for arxiv_id in get_arxiv_ids(SOURCES_DIR):
        sources_dir = sources(arxiv_id)
        clean_directory(directories.bibitems(arxiv_id))
        for path in find_files(sources_dir, [".tex", ".bbl"]):
            contents = read_file_tolerant(path)
            if contents is None:
                continue
            yield FileContents(arxiv_id, path, contents)

def unpack(arxiv_id: str, unpack_path: str) -> Optional[str]:
    archive_path = directories.source_archives(arxiv_id)
    if not os.path.exists(archive_path):
        logging.warning("No source archive directory found for %s", arxiv_id)
        return None
    if os.path.exists(unpack_path):
        logging.warning("Directory already found at %s. Deleting contents.", unpack_path)
        clean_directory(unpack_path)
    _unpack(archive_path, unpack_path)
    return unpack_path

def load(self) -> Iterator[FileContents]:
    for arxiv_id in get_arxiv_ids(directories.SOURCES_DIR):
        output_root = get_data_subdirectory_for_arxiv_id(
            directories.SOURCES_WITH_COLORIZED_CITATIONS_DIR, arxiv_id
        )
        clean_directory(output_root)

        original_sources_path = directories.sources(arxiv_id)
        for tex_path in find_files(original_sources_path, [".tex"], relative=True):
            contents = read_file_tolerant(os.path.join(original_sources_path, tex_path))
            if contents is not None:
                yield FileContents(arxiv_id, tex_path, contents)

def load(self) -> Iterator[MathMLForPaper]:
    for arxiv_id in get_arxiv_ids(directories.SYMBOLS_DIR):
        output_dir = directories.symbol_matches(arxiv_id)
        clean_directory(output_dir)

        symbols_with_ids = load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue
        symbols_mathml = {swi.symbol.mathml for swi in symbols_with_ids}

        yield MathMLForPaper(arxiv_id=arxiv_id, mathml_equations=symbols_mathml)

def load(self) -> Iterator[LocationTask]:
    for arxiv_id in get_arxiv_ids(directories.HUE_LOCATIONS_FOR_EQUATION_TOKENS_DIR):
        output_dir = directories.symbol_locations(arxiv_id)
        clean_directory(output_dir)

        token_locations: Dict[CharacterId, List[PdfBoundingBox]] = {}
        token_locations_path = os.path.join(
            directories.hue_locations_for_equation_tokens(arxiv_id),
            "hue_locations.csv",
        )
        if not os.path.exists(token_locations_path):
            logging.warning(
                "Could not find bounding box information for %s. Skipping.", arxiv_id
            )
            continue

        with open(token_locations_path) as token_locations_file:
            reader = csv.reader(token_locations_file)
            for row in reader:
                tex_path = row[-3]
                equation_index = int(row[-2])
                character_index = int(row[-1])
                character_id = CharacterId(tex_path, equation_index, character_index)
                box = PdfBoundingBox(
                    page=int(row[3]),
                    left=float(row[4]),
                    top=float(row[5]),
                    width=float(row[6]),
                    height=float(row[7]),
                )
                if character_id not in token_locations:
                    token_locations[character_id] = []
                token_locations[character_id].append(box)

        symbols_with_ids = load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue
        for symbol_with_id in symbols_with_ids:
            yield LocationTask(
                arxiv_id=arxiv_id,
                character_locations=token_locations,
                symbol_with_id=symbol_with_id,
            )

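# For reference, the indexing above implies a hue_locations.csv layout in
# which columns 3-7 hold the bounding box (page, left, top, width, height)
# and the last three columns hold (tex_path, equation_index,
# character_index); any columns in between are not read by this loader.
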
def load(self) -> Iterator[PageRasterPair]:
    for arxiv_id in get_arxiv_ids(directories.PAPER_IMAGES_DIR):
        output_dir = get_data_subdirectory_for_arxiv_id(
            self.get_output_base_dir(), arxiv_id
        )
        clean_directory(output_dir)

        # Get PDF names from results of compiling the uncolorized TeX sources.
        pdf_paths = get_compiled_pdfs(directories.compilation_results(arxiv_id))
        if len(pdf_paths) == 0:
            continue

        for iteration in get_iteration_names(self.get_raster_base_dir(), arxiv_id):
            original_images_dir = directories.paper_images(arxiv_id)
            modified_images_dir = get_data_subdirectory_for_iteration(
                self.get_raster_base_dir(), arxiv_id, iteration
            )
            for relative_pdf_path in pdf_paths:
                original_pdf_images_path = os.path.join(
                    original_images_dir, relative_pdf_path
                )
                for img_name in os.listdir(original_pdf_images_path):
                    original_img_path = os.path.join(original_pdf_images_path, img_name)
                    modified_img_path = os.path.join(
                        modified_images_dir, relative_pdf_path, img_name
                    )
                    if not os.path.exists(modified_img_path):
                        logging.warning(
                            "Could not find expected image %s. Skipping diff for this paper.",
                            modified_img_path,
                        )
                        break
                    original_img = cv2.imread(original_img_path)
                    modified_img = cv2.imread(modified_img_path)
                    yield PageRasterPair(
                        arxiv_id,
                        iteration,
                        relative_pdf_path,
                        img_name,
                        original_img,
                        modified_img,
                    )

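# Caution: cv2.imread returns None rather than raising when an image cannot
# be read. Only modified_img_path is existence-checked above, so a guard like
# the following sketch could be added before yielding, if stricter validation
# is wanted:
#
#     if original_img is None or modified_img is None:
#         logging.warning("Could not read %s or %s. Skipping this pair.",
#                         original_img_path, modified_img_path)
#         continue
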
def load(self) -> Iterator[RasterTask]:
    papers_base_dir = self.get_papers_base_dir()
    for arxiv_id in get_arxiv_ids(papers_base_dir):
        # Clean all past output for this arXiv ID.
        output_dir_for_arxiv_id = directories.get_data_subdirectory_for_arxiv_id(
            self.get_output_base_dir(), arxiv_id
        )
        clean_directory(output_dir_for_arxiv_id)

        for paper_dir in self.get_paper_dirs(arxiv_id):
            paper_abs_path = os.path.join(papers_base_dir, paper_dir)
            pdf_paths = get_compiled_pdfs(paper_abs_path)
            for path in pdf_paths:
                yield RasterTask(paper_dir, path, os.path.join(paper_abs_path, path))

def load(self) -> Iterator[EquationInfo]:
    colorized_equations_base_dir = directories.SOURCES_WITH_COLORIZED_EQUATIONS_DIR
    for arxiv_id in get_arxiv_ids(colorized_equations_base_dir):
        clean_directory(directories.equations(arxiv_id))
        for iteration in get_iteration_names(colorized_equations_base_dir, arxiv_id):
            colorized_sources_dir = get_data_subdirectory_for_iteration(
                colorized_equations_base_dir, arxiv_id, iteration
            )
            equation_hues_path = os.path.join(colorized_sources_dir, "equation_hues.csv")
            with open(equation_hues_path) as equation_hues_file:
                reader = csv.reader(equation_hues_file)
                for row in reader:
                    yield EquationInfo(arxiv_id, row[0], int(row[1]), row[4])

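# For reference, this loader reads only columns 0, 1, and 4 of
# equation_hues.csv (column 1 as an integer); columns 2 and 3 are ignored
# here.
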
def load(self) -> Iterator[TexAndSymbols]:
    for arxiv_id in get_arxiv_ids(directories.SOURCES_DIR):
        output_root = get_data_subdirectory_for_arxiv_id(
            directories.SOURCES_WITH_ANNOTATED_SYMBOLS, arxiv_id
        )
        clean_directory(output_root)

        symbols_dir = directories.symbols(arxiv_id)
        tokens_path = os.path.join(symbols_dir, "tokens.csv")
        if not os.path.exists(tokens_path):
            logging.info("No equation token data found for paper %s. Skipping.", arxiv_id)
            continue

        symbols_with_ids = load_symbols(arxiv_id)
        if symbols_with_ids is None:
            continue
        symbols = {swi.symbol_id: swi.symbol for swi in symbols_with_ids}

        tokens = load_tokens(arxiv_id)
        if tokens is None:
            continue
        tex_paths = {t.tex_path for t in tokens}

        characters: Dict[CharacterId, Character] = {}
        for token in tokens:
            character_id = CharacterId(
                token.tex_path, token.equation_index, token.token_index
            )
            characters[character_id] = Character(
                token.text, token.token_index, token.start, token.end
            )

        # Load original sources for TeX files that need to be colorized.
        contents_by_file = {}
        for tex_path in tex_paths:
            absolute_tex_path = os.path.join(directories.sources(arxiv_id), tex_path)
            contents = read_file_tolerant(absolute_tex_path)
            if contents is not None:
                contents_by_file[tex_path] = contents

        yield TexAndSymbols(arxiv_id, contents_by_file, symbols, characters)

def load(self) -> Iterator[PdfAndBoxes]:
    for arxiv_id in get_arxiv_ids(directories.COMPILED_SOURCES_DIR):
        output_dir = get_data_subdirectory_for_arxiv_id(
            self.get_output_base_dir(), arxiv_id
        )
        clean_directory(output_dir)

        boxes_and_hues = self.load_bounding_boxes(arxiv_id)
        pdf_paths = get_compiled_pdfs(directories.compilation_results(arxiv_id))
        if len(pdf_paths) == 0:
            continue

        for relative_pdf_path in pdf_paths:
            absolute_pdf_path = os.path.join(
                directories.compilation_results(arxiv_id), relative_pdf_path
            )
            if relative_pdf_path in boxes_and_hues:
                yield PdfAndBoxes(
                    arxiv_id,
                    relative_pdf_path,
                    absolute_pdf_path,
                    boxes_and_hues[relative_pdf_path],
                )

def load(self) -> Iterator[SearchTask]:
    for arxiv_id in get_arxiv_ids(self.get_diff_images_base_dir()):
        output_dir = get_data_subdirectory_for_arxiv_id(
            self.get_output_base_dir(), arxiv_id
        )
        clean_directory(output_dir)

        # Get PDF names from results of compiling the uncolorized TeX sources.
        compiled_pdf_paths = get_compiled_pdfs(directories.compilation_results(arxiv_id))

        for iteration in get_iteration_names(self.get_diff_images_base_dir(), arxiv_id):
            diff_images_dir = get_data_subdirectory_for_iteration(
                self.get_diff_images_base_dir(), arxiv_id, iteration
            )

            hue_searches = self.load_hues(arxiv_id, iteration)
            hue_searches_by_pdf: Dict[PdfPath, List[HueSearchRegion]] = {}
            for search in hue_searches:
                pdfs_to_search = (
                    [search.relative_pdf_path]
                    if search.relative_pdf_path is not None
                    else compiled_pdf_paths
                )
                for pdf_path in pdfs_to_search:
                    if pdf_path not in hue_searches_by_pdf:
                        hue_searches_by_pdf[pdf_path] = []
                    hue_searches_by_pdf[pdf_path].append(search)

            for relative_pdf_path, search_regions in hue_searches_by_pdf.items():
                # PDF reads with PyPDF2 are costly, so do them all at once.
                pdf_page_dimensions: Dict[int, Dimensions] = {}
                absolute_pdf_path = os.path.join(
                    directories.compilation_results(arxiv_id), relative_pdf_path
                )
                with open(absolute_pdf_path, "rb") as pdf_file:
                    pdf = PdfFileReader(pdf_file)
                    for page_number in range(pdf.getNumPages()):
                        page = pdf.getPage(page_number)
                        width = page.mediaBox.getWidth()
                        height = page.mediaBox.getHeight()
                        pdf_page_dimensions[page_number] = Dimensions(width, height)

                diff_images_pdf_path = os.path.join(diff_images_dir, relative_pdf_path)
                page_images = {}
                for img_name in os.listdir(diff_images_pdf_path):
                    img_path = os.path.join(diff_images_pdf_path, img_name)
                    page_image = cv2.imread(img_path)
                    page_number = int(os.path.splitext(img_name)[0].replace("page-", ""))
                    page_images[page_number] = page_image

                for search_region in search_regions:
                    yield SearchTask(
                        arxiv_id,
                        iteration,
                        page_images,
                        pdf_page_dimensions,
                        relative_pdf_path,
                        search_region,
                    )

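# Note: PdfFileReader, getNumPages, getPage, and mediaBox belong to the
# legacy PyPDF2 API. A sketch of the equivalent page-dimension loop under the
# modern pypdf package (an assumption; the pipeline itself uses PyPDF2):
#
#     from pypdf import PdfReader
#
#     reader = PdfReader(pdf_file)
#     for page_number, page in enumerate(reader.pages):
#         width = page.mediabox.width
#         height = page.mediabox.height
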
def load(self) -> Iterator[ArxivId]:
    for arxiv_id in get_arxiv_ids(EQUATIONS_DIR):
        clean_directory(directories.symbols(arxiv_id))
        yield arxiv_id