    def load(self) -> Iterator[TexAndTokens]:
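        # For each paper, yield the TeX file contents along with the equation tokens to be colorized in them.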
        for arxiv_id in get_arxiv_ids(directories.SOURCES_DIR):

            output_root = get_data_subdirectory_for_arxiv_id(
                directories.SOURCES_WITH_COLORIZED_EQUATION_TOKENS_DIR,
                arxiv_id)
            clean_directory(output_root)

            tokens_path = os.path.join(directories.symbols(arxiv_id),
                                       "tokens.csv")
            if not os.path.exists(tokens_path):
                logging.info(
                    "No equation token data found for paper %s. Skipping.",
                    arxiv_id)
                continue

            # Load token location information
            tokens = load_tokens(arxiv_id)
            if tokens is None:
                continue
            tex_paths = {token.tex_path for token in tokens}

            # Load original sources for TeX files that need to be colorized
            contents_by_file = {}
            for tex_path in tex_paths:
                absolute_tex_path = os.path.join(directories.sources(arxiv_id),
                                                 tex_path)
                contents = read_file_tolerant(absolute_tex_path)
                if contents is not None:
                    contents_by_file[tex_path] = contents

            yield TexAndTokens(arxiv_id, contents_by_file, tokens)
    def load(self) -> Iterator[SymbolData]:
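        # For each paper, yield its symbols together with their bounding boxes and MathML matches.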
        for arxiv_id in get_arxiv_ids(SOURCES_DIR):

            s2_id = get_s2_id(arxiv_id)
            if s2_id is None:
                continue

            symbols_with_ids = load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

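            # Read the bounding box found for each symbol.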
            boxes: Dict[SymbolId, PdfBoundingBox] = {}
            boxes_path = os.path.join(
                directories.symbol_locations(arxiv_id), "symbol_locations.csv"
            )
            if not os.path.exists(boxes_path):
                logging.warning(
                    "Could not find bounding boxes information for %s. Skipping",
                    arxiv_id,
                )
                continue
            with open(boxes_path) as boxes_file:
                reader = csv.reader(boxes_file)
                for row in reader:
                    symbol_id = SymbolId(
                        tex_path=row[0],
                        equation_index=int(row[1]),
                        symbol_index=int(row[2]),
                    )
                    box = PdfBoundingBox(
                        page=int(row[3]),
                        left=float(row[4]),
                        top=float(row[5]),
                        width=float(row[6]),
                        height=float(row[7]),
                    )
                    boxes[symbol_id] = box

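            # Read the ranked matches found for each symbol's MathML.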
            matches: Matches = {}
            matches_path = os.path.join(
                directories.symbol_matches(arxiv_id), "matches.csv"
            )
            if not os.path.exists(matches_path):
                logging.warning(
                    "Could not find symbol matches information for %s. Skipping",
                    arxiv_id,
                )
                continue
            with open(matches_path) as matches_file:
                reader = csv.reader(matches_file)
                for row in reader:
                    mathml = row[0]
                    match_mathml = row[1]
                    rank = int(row[2])
                    if mathml not in matches:
                        matches[mathml] = []
                    matches[mathml].append(Match(match_mathml, rank))

            yield SymbolData(arxiv_id, s2_id, symbols_with_ids, boxes, matches)
    def load(self) -> Iterator[FileContents]:
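        # Yield the contents of each TeX and .bbl file so bibitems can be extracted from them.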
        for arxiv_id in get_arxiv_ids(SOURCES_DIR):
            sources_dir = sources(arxiv_id)
            clean_directory(directories.bibitems(arxiv_id))
            for path in find_files(sources_dir, [".tex", ".bbl"]):
                contents = read_file_tolerant(path)
                if contents is None:
                    continue
                yield FileContents(arxiv_id, path, contents)
    def load(self) -> Iterator[FileContents]:
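        # Yield the contents of each TeX file so citations can be colorized in them.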
        for arxiv_id in get_arxiv_ids(directories.SOURCES_DIR):

            output_root = get_data_subdirectory_for_arxiv_id(
                directories.SOURCES_WITH_COLORIZED_CITATIONS_DIR, arxiv_id)
            clean_directory(output_root)

            original_sources_path = directories.sources(arxiv_id)
            for text_path in find_files(original_sources_path, [".tex"],
                                        relative=True):
                contents = read_file_tolerant(
                    os.path.join(original_sources_path, text_path))
                if contents is not None:
                    yield FileContents(arxiv_id, text_path, contents)
    def load(self) -> Iterator[MathMLForPaper]:
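        # For each paper, yield the set of unique MathML strings for its symbols.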

        for arxiv_id in get_arxiv_ids(directories.SYMBOLS_DIR):

            output_dir = directories.symbol_matches(arxiv_id)
            clean_directory(output_dir)

            symbols_with_ids = load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            symbols_mathml = {swi.symbol.mathml for swi in symbols_with_ids}

            yield MathMLForPaper(arxiv_id=arxiv_id,
                                 mathml_equations=symbols_mathml)
    def load(self) -> Iterator[LocationTask]:
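        # Yield one location task per symbol, paired with the locations found for its characters.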

        for arxiv_id in get_arxiv_ids(
                directories.HUE_LOCATIONS_FOR_EQUATION_TOKENS_DIR):

            output_dir = directories.symbol_locations(arxiv_id)
            clean_directory(output_dir)

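            # Read the bounding boxes found for each colorized equation token.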
            token_locations: Dict[CharacterId, List[PdfBoundingBox]] = {}
            token_locations_path = os.path.join(
                directories.hue_locations_for_equation_tokens(arxiv_id),
                "hue_locations.csv",
            )
            if not os.path.exists(token_locations_path):
                logging.warning(
                    "Could not find bounding boxes information for %s. Skipping",
                    arxiv_id,
                )
                continue
            with open(token_locations_path) as token_locations_file:
                reader = csv.reader(token_locations_file)
                for row in reader:
                    tex_path = row[-3]
                    equation_index = int(row[-2])
                    character_index = int(row[-1])
                    character_id = CharacterId(tex_path, equation_index,
                                               character_index)
                    box = PdfBoundingBox(
                        page=int(row[3]),
                        left=float(row[4]),
                        top=float(row[5]),
                        width=float(row[6]),
                        height=float(row[7]),
                    )
                    if character_id not in token_locations:
                        token_locations[character_id] = []
                    token_locations[character_id].append(box)

            symbols_with_ids = load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue

            for symbol_with_id in symbols_with_ids:
                yield LocationTask(
                    arxiv_id=arxiv_id,
                    character_locations=token_locations,
                    symbol_with_id=symbol_with_id,
                )
    def load(self) -> Iterator[PageRasterPair]:
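        # Pair each rasterized page of the original PDFs with its counterpart from the modified (colorized) sources.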
        for arxiv_id in get_arxiv_ids(directories.PAPER_IMAGES_DIR):
            output_dir = get_data_subdirectory_for_arxiv_id(
                self.get_output_base_dir(), arxiv_id
            )
            clean_directory(output_dir)

            # Get PDF names from results of compiling the uncolorized TeX sources.
            pdf_paths = get_compiled_pdfs(directories.compilation_results(arxiv_id))
            if len(pdf_paths) == 0:
                continue

            for iteration in get_iteration_names(self.get_raster_base_dir(), arxiv_id):

                original_images_dir = directories.paper_images(arxiv_id)
                modified_images_dir = get_data_subdirectory_for_iteration(
                    self.get_raster_base_dir(), arxiv_id, iteration
                )

                for relative_pdf_path in pdf_paths:
                    original_pdf_images_path = os.path.join(
                        original_images_dir, relative_pdf_path
                    )
                    for img_name in os.listdir(original_pdf_images_path):
                        original_img_path = os.path.join(
                            original_pdf_images_path, img_name
                        )
                        modified_img_path = os.path.join(
                            modified_images_dir, relative_pdf_path, img_name
                        )
                        if not os.path.exists(modified_img_path):
                            logging.warning(
                                "Could not find expected image %s. Skipping diff for this paper.",
                                modified_img_path,
                            )
                            break

                        original_img = cv2.imread(original_img_path)
                        modified_img = cv2.imread(modified_img_path)
                        yield PageRasterPair(
                            arxiv_id,
                            iteration,
                            relative_pdf_path,
                            img_name,
                            original_img,
                            modified_img,
                        )
    def load(self) -> Iterator[RasterTask]:
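        # Yield one raster task for each compiled PDF of each paper.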

        papers_base_dir = self.get_papers_base_dir()
        for arxiv_id in get_arxiv_ids(papers_base_dir):

            # Clean all past output for this arXiv ID.
            output_dir_for_arxiv_id = directories.get_data_subdirectory_for_arxiv_id(
                self.get_output_base_dir(), arxiv_id)
            clean_directory(output_dir_for_arxiv_id)

            for paper_dir in self.get_paper_dirs(arxiv_id):
                paper_abs_path = os.path.join(self.get_papers_base_dir(),
                                              paper_dir)
                pdf_paths = get_compiled_pdfs(paper_abs_path)
                for path in pdf_paths:
                    yield RasterTask(paper_dir, path,
                                     os.path.join(paper_abs_path, path))
    def load(self) -> Iterator[EquationInfo]:
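        # Read the hues assigned to colorized equations and yield one record per equation.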

        colorized_equations_base_dir = directories.SOURCES_WITH_COLORIZED_EQUATIONS_DIR
        for arxiv_id in get_arxiv_ids(colorized_equations_base_dir):
            clean_directory(directories.equations(arxiv_id))

            for iteration in get_iteration_names(colorized_equations_base_dir,
                                                 arxiv_id):
                colorized_sources_dir = get_data_subdirectory_for_iteration(
                    colorized_equations_base_dir, arxiv_id, iteration)
                equation_hues_path = os.path.join(colorized_sources_dir,
                                                  "equation_hues.csv")
                with open(equation_hues_path) as equation_hues_file:
                    reader = csv.reader(equation_hues_file)
                    for row in reader:
                        yield EquationInfo(arxiv_id, row[0], int(row[1]),
                                           row[4])
    def load(self) -> Iterator[TexAndSymbols]:
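        # For each paper, yield its TeX file contents along with its symbols and their characters.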
        for arxiv_id in get_arxiv_ids(directories.SOURCES_DIR):

            output_root = get_data_subdirectory_for_arxiv_id(
                directories.SOURCES_WITH_ANNOTATED_SYMBOLS, arxiv_id)
            clean_directory(output_root)

            symbols_dir = directories.symbols(arxiv_id)
            tokens_path = os.path.join(symbols_dir, "tokens.csv")
            if not os.path.exists(tokens_path):
                logging.info(
                    "No equation token data found for paper %s. Skipping.",
                    arxiv_id)
                continue

            symbols_with_ids = load_symbols(arxiv_id)
            if symbols_with_ids is None:
                continue
            symbols = {swi.symbol_id: swi.symbol for swi in symbols_with_ids}

            tokens = load_tokens(arxiv_id)
            if tokens is None:
                continue
            tex_paths = {t.tex_path for t in tokens}

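            # Index each token's character data by its position in the TeX.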
            characters: Dict[CharacterId, Character] = {}
            for token in tokens:
                character_id = CharacterId(token.tex_path,
                                           token.equation_index,
                                           token.token_index)
                characters[character_id] = Character(token.text,
                                                     token.token_index,
                                                     token.start, token.end)

            # Load original sources for TeX files that need to be colorized
            contents_by_file = {}
            for tex_path in tex_paths:
                absolute_tex_path = os.path.join(directories.sources(arxiv_id),
                                                 tex_path)
                contents = read_file_tolerant(absolute_tex_path)
                if contents is not None:
                    contents_by_file[tex_path] = contents

            yield TexAndSymbols(arxiv_id, contents_by_file, symbols,
                                characters)
    def load(self) -> Iterator[PdfAndBoxes]:
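        # Pair each compiled PDF with the bounding boxes and hues found for it.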
        for arxiv_id in get_arxiv_ids(directories.COMPILED_SOURCES_DIR):
            output_dir = get_data_subdirectory_for_arxiv_id(
                self.get_output_base_dir(), arxiv_id)
            clean_directory(output_dir)

            boxes_and_hues = self.load_bounding_boxes(arxiv_id)

            pdf_paths = get_compiled_pdfs(
                directories.compilation_results(arxiv_id))
            if len(pdf_paths) == 0:
                continue

            for relative_pdf_path in pdf_paths:
                absolute_pdf_path = os.path.join(
                    directories.compilation_results(arxiv_id),
                    relative_pdf_path)
                if relative_pdf_path in boxes_and_hues:
                    yield PdfAndBoxes(
                        arxiv_id,
                        relative_pdf_path,
                        absolute_pdf_path,
                        boxes_and_hues[relative_pdf_path],
                    )
    def load(self) -> Iterator[SearchTask]:
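        # Yield one search task per hue region, per PDF, per colorization iteration.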
        for arxiv_id in get_arxiv_ids(self.get_diff_images_base_dir()):
            output_dir = get_data_subdirectory_for_arxiv_id(
                self.get_output_base_dir(), arxiv_id
            )
            clean_directory(output_dir)

            # Get PDF names from results of compiling the uncolorized TeX sources.
            compiled_pdf_paths = get_compiled_pdfs(
                directories.compilation_results(arxiv_id)
            )

            for iteration in get_iteration_names(
                self.get_diff_images_base_dir(), arxiv_id
            ):

                diff_images_dir = get_data_subdirectory_for_iteration(
                    self.get_diff_images_base_dir(), arxiv_id, iteration
                )

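                # Group the hue search regions by the PDF they should be searched in.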
                hue_searches = self.load_hues(arxiv_id, iteration)
                hue_searches_by_pdf: Dict[PdfPath, List[HueSearchRegion]] = {}
                for search in hue_searches:
                    pdfs_to_search = (
                        [search.relative_pdf_path]
                        if search.relative_pdf_path is not None
                        else compiled_pdf_paths
                    )
                    for pdf_path in pdfs_to_search:
                        if pdf_path not in hue_searches_by_pdf:
                            hue_searches_by_pdf[pdf_path] = []
                        hue_searches_by_pdf[pdf_path].append(search)

                for relative_pdf_path, search_regions in hue_searches_by_pdf.items():

                    # PDF reads with PyPDF2 are costly, so do them all at once.
                    pdf_page_dimensions: Dict[int, Dimensions] = {}
                    absolute_pdf_path = os.path.join(
                        directories.compilation_results(arxiv_id), relative_pdf_path
                    )
                    with open(absolute_pdf_path, "rb") as pdf_file:
                        pdf = PdfFileReader(pdf_file)
                        for page_number in range(pdf.getNumPages()):
                            page = pdf.getPage(page_number)
                            width = page.mediaBox.getWidth()
                            height = page.mediaBox.getHeight()
                            pdf_page_dimensions[page_number] = Dimensions(width, height)

                    diff_images_pdf_path = os.path.join(
                        diff_images_dir, relative_pdf_path
                    )
                    page_images = {}

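                    # Load the diff image for each page, keyed by the page number parsed from the file name.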
                    for img_name in os.listdir(diff_images_pdf_path):
                        img_path = os.path.join(diff_images_pdf_path, img_name)
                        page_image = cv2.imread(img_path)
                        page_number = int(
                            os.path.splitext(img_name)[0].replace("page-", "")
                        )
                        page_images[page_number] = page_image

                    for search_region in search_regions:
                        yield SearchTask(
                            arxiv_id,
                            iteration,
                            page_images,
                            pdf_page_dimensions,
                            relative_pdf_path,
                            search_region,
                        )
    def load(self) -> Iterator[ArxivId]:
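        # Yield each paper with extracted equations, clearing any previous symbol output.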
        for arxiv_id in get_arxiv_ids(EQUATIONS_DIR):
            clean_directory(directories.symbols(arxiv_id))
            yield arxiv_id
    def load(self) -> Iterator[ArxivId]:
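        # Yield each paper for which source archives have been downloaded.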
        for arxiv_id in get_arxiv_ids(SOURCE_ARCHIVES_DIR):
            yield arxiv_id
    def load(self) -> Iterator[CitationData]:
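        # Assemble all citation data for each paper: bounding boxes, hues, resolved keys, and S2 metadata.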
        for arxiv_id in get_arxiv_ids(SOURCES_DIR):

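            # Read the bounding boxes found for each citation hue.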
            boxes_by_hue_iteration: Dict[HueIteration,
                                         List[PdfBoundingBox]] = {}
            bounding_boxes_path = os.path.join(
                directories.hue_locations_for_citations(arxiv_id),
                "hue_locations.csv")
            if not os.path.exists(bounding_boxes_path):
                logging.warning(
                    "Could not find bounding boxes information for %s. Skipping",
                    arxiv_id,
                )
                continue
            with open(bounding_boxes_path) as bounding_boxes_file:
                reader = csv.reader(bounding_boxes_file)
                for row in reader:
                    iteration = row[1]
                    hue = float(row[2])
                    box = PdfBoundingBox(
                        page=int(row[3]),
                        left=float(row[4]),
                        top=float(row[5]),
                        width=float(row[6]),
                        height=float(row[7]),
                    )
                    hue_iteration = HueIteration(hue, iteration)
                    if hue_iteration not in boxes_by_hue_iteration:
                        boxes_by_hue_iteration[hue_iteration] = []
                    boxes_by_hue_iteration[hue_iteration].append(box)

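            # Map each hue and iteration back to the citation keys it was used to colorize.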
            citations_by_hue_iteration: Dict[HueIteration, CitationKeys] = {}
            for iteration in get_iteration_names(
                    directories.SOURCES_WITH_COLORIZED_CITATIONS_DIR,
                    arxiv_id):
                citation_hues_path = os.path.join(
                    get_data_subdirectory_for_iteration(
                        directories.SOURCES_WITH_COLORIZED_CITATIONS_DIR,
                        arxiv_id,
                        iteration,
                    ),
                    "citation_hues.csv",
                )
                if not os.path.exists(citation_hues_path):
                    logging.warning(
                        "Could not find citation hue colors for %s iteration %s. Skipping",
                        arxiv_id,
                        iteration,
                    )
                    continue
                with open(citation_hues_path) as citation_hues_file:
                    reader = csv.reader(citation_hues_file)
                    for row in reader:
                        citation_keys = cast(Tuple[str, ...],
                                             tuple(json.loads(row[3])))
                        hue = float(row[2])
                        iteration = row[1]
                        hue_iteration = HueIteration(hue, iteration)
                        citations_by_hue_iteration[
                            hue_iteration] = citation_keys

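            # Read the S2 paper ID that each citation key was resolved to.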
            key_s2_ids: Dict[CitationKey, S2Id] = {}
            key_resolutions_path = os.path.join(
                directories.bibitem_resolutions(arxiv_id), "resolutions.csv")
            if not os.path.exists(key_resolutions_path):
                logging.warning(
                    "Could not find citation resolutions for %s. Skipping",
                    arxiv_id)
                continue
            with open(key_resolutions_path) as key_resolutions_file:
                reader = csv.reader(key_resolutions_file)
                for row in reader:
                    key = row[0]
                    s2_id = row[1]
                    key_s2_ids[key] = s2_id

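            # Read this paper's own S2 ID.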
            s2_id_path = os.path.join(directories.s2_metadata(arxiv_id),
                                      "s2_id")
            if not os.path.exists(s2_id_path):
                logging.warning("Could not find S2 ID file for %s. Skipping",
                                arxiv_id)
                continue
            with open(s2_id_path) as s2_id_file:
                s2_id = s2_id_file.read()

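            # Read the S2 metadata for each of the paper's references.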
            s2_data: Dict[S2Id, Reference] = {}
            s2_metadata_path = os.path.join(directories.s2_metadata(arxiv_id),
                                            "references.csv")
            if not os.path.exists(s2_metadata_path):
                logging.warning(
                    "Could not find S2 metadata file for citations for %s. Skipping",
                    arxiv_id,
                )
                continue
            with open(s2_metadata_path) as s2_metadata_file:
                reader = csv.reader(s2_metadata_file)
                for row in reader:
                    s2_data[row[0]] = Reference(
                        s2Id=row[0],
                        arxivId=row[1] if row[1] != "" else None,
                        doi=row[2] if row[2] != "" else None,
                        title=row[3],
                        authors=[
                            Author(id=None, name=nm)
                            for nm in row[4].split(",")
                        ],
                        venue=row[5],
                        year=int(row[6]) if row[6] != "" else None,
                    )

            yield CitationData(
                arxiv_id,
                s2_id,
                boxes_by_hue_iteration,
                citations_by_hue_iteration,
                key_s2_ids,
                s2_data,
            )