Example 1
    def save(self, item: RasterTask, _: None) -> None:
        raster_pages(
            directories.arxiv_subdir("compiled-normalized-sources", item.arxiv_id),
            os.path.join(
                directories.arxiv_subdir("paper-images", item.arxiv_id),
                directories.escape_slashes(item.relative_output_file_path),
            ),
            item.relative_output_file_path,
            item.output_file_type,
        )
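The fields read off the task object hint at its shape. A minimal sketch of such a task type, inferred purely from the attribute accesses above (the dataclass and the example values are assumptions; the real class may carry more fields):

from dataclasses import dataclass

@dataclass
class RasterTask:
    # Hypothetical shape, inferred from the attributes the save() method reads.
    arxiv_id: str                   # e.g. "1802.04865"
    relative_output_file_path: str  # compiled file's relative path, e.g. "main.pdf"
    output_file_type: str           # e.g. "pdf" or "ps"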
Example 2
    def save(self, item: RasterTask, _: None) -> None:
        output_dir = os.path.join(
            directories.dirpath(self.get_output_base_dirkey()),
            item.compiled_tex_path,
            directories.escape_slashes(item.relative_output_file_path),
        )
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        config = configparser.ConfigParser()
        config.read(RASTER_CONFIG)

        raster_commands: Dict[str, str] = {}
        if "rasterers" in config:
            raster_commands = {k: v for (k, v) in config["rasterers"].items()}

        try:
            raster_command = raster_commands[item.output_file_type]
        except KeyError:
            logging.warning(  # pylint: disable=logging-not-lazy
                ("Could not find a rastering command for file %s (of type %s) "
                 + "in directory %s. This file will not be rastered."),
                item.relative_output_file_path,
                item.output_file_type,
                item.compiled_tex_path,
            )
            return

        args = ast.literal_eval(raster_command)
        args_resolved = [
            arg.format(output_dir=output_dir,
                       file=item.absolute_output_file_path) for arg in args
        ]

        result = subprocess.run(
            args_resolved,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
        if result.returncode == 0:
            logging.debug(
                "Successfully rastered pages for file %s using command %s",
                item.absolute_output_file_path,
                args_resolved,
            )
        else:
            logging.error(
                "Error rastering file %s using command %s: (Stdout: %s), (Stderr: %s)",
                item.absolute_output_file_path,
                args_resolved,
                result.stdout,
                result.stderr,
            )
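This version reads its rastering commands from an INI file: each key in the [rasterers] section names a file type, and each value is a Python list literal of command-line arguments with {output_dir} and {file} placeholders, recovered with ast.literal_eval. A hedged sketch of that round trip (the file name and the pdftoppm command are illustrative assumptions, not the project's actual config):

import ast
import configparser

# Hypothetical raster.ini, in the format the code above expects:
#
#   [rasterers]
#   pdf = ["pdftoppm", "-png", "{file}", "{output_dir}/page"]
#
config = configparser.ConfigParser()
config.read("raster.ini")

# Section values are strings; literal_eval recovers the argument templates.
template = ast.literal_eval(config["rasterers"]["pdf"])
args = [arg.format(output_dir="out", file="paper.pdf") for arg in template]
print(args)  # ['pdftoppm', '-png', 'paper.pdf', 'out/page']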
Example 3
    def process(self, item: LocationTask) -> Iterator[HueLocationInfo]:

        # Filter out entities that are empty (i.e., have nothing to color).
        # A '-1' in the 'start' or 'end' field indicates that the entity does not occur in a
        # specific place in the TeX, but rather that a custom coloring technique based on other
        # entity properties will be used. So entities that have a '-1' for their start and
        # end should still be processed even though they appear to be zero-length.
        entities_filtered = [
            e for e in item.entities
            if e.start == -1 or e.end == -1 or e.start != e.end
        ]

        # Sort entities by the order in which they appear in the TeX. This allows the pipeline
        # to keep track of which ones appear first when trying to recover from errors (i.e., when
        # trying to detect which entity in a batch may have shifted, causing many others to move).
        entities_ordered = sorted(entities_filtered, key=lambda e: e.start)

        # Construct a queue of entities to detect.
        entities_by_id = {e.id_: e for e in entities_ordered}
        to_process = deque([e.id_ for e in entities_ordered])
        to_process_alone: Deque[str] = deque()

        # Paths to output directories. These directories will be redefined once for each batch.
        colorized_tex_dir: Optional[str] = None
        compiled_tex_dir: Optional[str] = None
        raster_output_dir: Optional[str] = None
        diffs_output_dir: Optional[str] = None

        # Iteration state
        batch_index = -1
        iteration_id = None

        def next_batch() -> List[str]:
            """
            Get the next batch of entities to process. First tries to sample a batch from
            'to_process', and then attempts to sample individual entities from 'to_process_alone'.
            """
            if len(to_process) > 0:
                return [
                    to_process.popleft()
                    for _ in range(min(self.args.batch_size, len(to_process)))
                ]
            return [to_process_alone.popleft()]

        def _cleanup_from_last_batch() -> None:
            " Clean up output directories from the last batch. "
            if batch_index > -1 and not self.args.keep_intermediate_files:
                logging.debug(  # pylint: disable=logging-not-lazy
                    "Deleting intermediate files used to locate entities (i.e., colorized "
                    +
                    "sources, compilation results, and rasters) for paper %s iteration %s",
                    item.arxiv_id,
                    iteration_id or "''",
                )
                intermediate_files_dirs = [
                    colorized_tex_dir,
                    compiled_tex_dir,
                    raster_output_dir,
                    diffs_output_dir,
                ]
                for dir_ in intermediate_files_dirs:
                    if dir_ and os.path.exists(dir_):
                        file_utils.clean_directory(dir_)
                        os.rmdir(dir_)

        while len(to_process) > 0 or len(to_process_alone) > 0:
            if batch_index > -1:
                _cleanup_from_last_batch()

            batch_index += 1
            logging.debug(
                "Locating bounding boxes for batch %d-%d of entities of type %s for paper %s.",
                item.group,
                batch_index,
                self.get_entity_name(),
                item.arxiv_id,
            )
            iteration_id = directories.tex_iteration(
                item.tex_path, f"{item.group}-{batch_index}")

            # Define output directory locations for this batch.
            colorized_tex_dir = directories.iteration(
                self.output_base_dirs["sources"], item.arxiv_id, iteration_id)
            compiled_tex_dir = directories.iteration(
                self.output_base_dirs["compiled-sources"],
                item.arxiv_id,
                iteration_id,
            )
            raster_output_dir = directories.iteration(
                self.output_base_dirs["paper-images"], item.arxiv_id,
                iteration_id)
            diffs_output_dir = directories.iteration(
                self.output_base_dirs["diffed-images"], item.arxiv_id,
                iteration_id)

            # Fetch the next batch of entities to process.
            batch = next_batch()
            entities: List[SerializableEntity] = [
                entities_by_id[id_] for id_ in batch
            ]

            # Colorize the TeX for all the entities.
            custom_colorize_func = self.get_colorize_func()
            logging.debug(
                "Attempting to colorize entities in TeX for entity batch %d-%d of paper %s.",
                item.group,
                batch_index,
                item.arxiv_id,
            )
            if custom_colorize_func is not None:
                colorized_tex = custom_colorize_func(
                    item.file_contents.contents, entities,
                    self.get_colorize_options())
                if len(colorized_tex.entity_hues) == 0:
                    logging.info(  # pylint: disable=logging-not-lazy
                        "Custom colorization function colored nothing for entity batch %d-%d of "
                        +
                        "paper %s when coloring file %s. The function probably decide there was "
                        +
                        "nothing to do for this file, and will hopefullly colorize these "
                        +
                        "entities in another file. Skipping this batch for this file.",
                        item.group,
                        batch_index,
                        item.arxiv_id,
                        item.file_contents.path,
                    )
                    continue
            else:
                colorized_tex = colorize_entities(item.file_contents.contents,
                                                  entities,
                                                  self.get_colorize_options())

            # If some entities were skipped during colorization, perhaps because they
            # overlapped with each other, add them back to the work queue.
            if colorized_tex.skipped:
                logging.info(  # pylint: disable=logging-not-lazy
                    "Entities %s were skipped during colorization batch %d-%d for paper "
                    + "%s. They will be processed in a later batch.",
                    [e.id_ for e in colorized_tex.skipped],
                    item.group,
                    batch_index,
                    item.arxiv_id,
                )
                # Queue skipped entities in the order that they initially appeared in the batch.
                reprocess_ids = {e.id_ for e in colorized_tex.skipped}
                reprocess_sorted = [
                    id_ for id_ in batch if id_ in reprocess_ids
                ]
                to_process.extendleft(reversed(reprocess_sorted))

                # Remove skipped entities from the current batch.
                for skip in colorized_tex.skipped:
                    del batch[batch.index(skip.id_)]

            # Save the colorized TeX to the file system.
            save_success = save_colorized_tex(
                item.arxiv_id,
                colorized_tex_dir,
                item.tex_path,
                iteration_id,
                colorized_tex.tex,
                item.file_contents.encoding,
                colorized_tex.entity_hues,
            )
            logging.debug(
                "Finished attempting to colorize entities for entity batch %d-%d of paper %s.",
                item.group,
                batch_index,
                item.arxiv_id,
            )
            if not save_success:
                logging.error(  # pylint: disable=logging-not-lazy
                    "Failed to save colorized TeX files for arXiv paper %s. "
                    "This paper will be skipped.",
                    item.arxiv_id,
                )
                return

            # Compile the TeX with the colors.
            shutil.copytree(colorized_tex_dir, compiled_tex_dir)
            compilation_result = compile_tex(compiled_tex_dir)
            save_compilation_result("compiled-sources", item.arxiv_id,
                                    compiled_tex_dir, compilation_result)
            if not compilation_result.success:

                # If colorizing a specific entity caused the failure, remove the entity that caused
                # the problem from the batch and restart with a new batch, minus this entity.
                last_colorized_entity_id = get_last_colorized_entity(
                    item.arxiv_id, compiled_tex_dir)
                if last_colorized_entity_id is not None:
                    problem_ids = [last_colorized_entity_id]
                    if batch.index(last_colorized_entity_id) < len(batch) - 1:
                        problem_ids += [
                            batch[batch.index(last_colorized_entity_id) + 1]
                        ]

                    if len(batch) == 1:
                        logging.warning(  # pylint: disable=logging-not-lazy
                            "Failed to compile paper %s with colorized entity %s, even when it was "
                            +
                            "colorized in isolation. The location of this entity will not be detected.",
                            item.arxiv_id,
                            batch[0],
                        )
                        continue

                    logging.warning(  # pylint: disable=logging-not-lazy
                        "Failed to compile paper %s with colorized entities. The culprit may be "
                        +
                        "the colorization command for entity %s. The problematic entities will be "
                        +
                        "colorized on their own, and the rest of the entities will be colorized "
                        + "together in the next batch.",
                        item.arxiv_id,
                        " or ".join(problem_ids),
                    )

                    for id_ in problem_ids:
                        to_process_alone.append(id_)
                        del batch[batch.index(id_)]

                    to_process.extendleft(reversed(batch))
                    continue

                # If there was some other reason for the error, remove just the first entity from the batch.
                logging.error(  # pylint: disable=logging-not-lazy
                    "Failed to compile paper %s with colorized entities %s. The cause "
                    +
                    "is assumed to be in the first colorized entity. The location for the "
                    +
                    "first entity %s will not be detected. The remainder of the entities in "
                    + "this batch will be processed in another batch.",
                    item.arxiv_id,
                    batch,
                    batch[0],
                )
                del batch[0]
                to_process.extendleft(reversed(batch))
                continue

            # Raster the pages to images, and compute diffs from the original images.
            output_files = compilation_result.output_files
            for output_file in output_files:
                raster_success = raster_pages(
                    compiled_tex_dir,
                    os.path.join(raster_output_dir,
                                 directories.escape_slashes(output_file.path)),
                    output_file.path,
                    output_file.output_type,
                )
                if not raster_success:
                    logging.error(  # pylint: disable=logging-not-lazy
                        "Failed to rasterize pages for %s iteration %s. The locations for entities "
                        + "with IDs %s with not be detected.",
                        item.arxiv_id,
                        iteration_id,
                        batch,
                    )
                    continue

                logging.debug(
                    "Attempting to diff rastered pages for paper %s iteration %s.",
                    item.arxiv_id,
                    iteration_id,
                )
                diff_success = diff_images_in_raster_dirs(
                    output_files,
                    raster_output_dir,
                    diffs_output_dir,
                    item.arxiv_id,
                )
                logging.debug(
                    "Finished diffing attempt for paper %s iteration %s. Success? %s.",
                    item.arxiv_id,
                    iteration_id,
                    diff_success,
                )
                if not diff_success:
                    logging.error(  # pylint: disable=logging-not-lazy
                        "Failed to difference images of original and colorized versions of "
                        +
                        "papers %s in batch processing iteration %s. The locations for entities with IDs "
                        + "%s will not be detected.",
                        item.arxiv_id,
                        iteration_id,
                        batch,
                    )
                    continue

            # Locate the entities in the diffed images.
            logging.debug(
                "Attempting to locate entities using image differences for paper %s iteration %s.",
                item.arxiv_id,
                iteration_id,
            )
            entity_hues = colorized_tex.entity_hues
            location_result = locate_entities(item.arxiv_id, raster_output_dir,
                                              diffs_output_dir, entity_hues)
            if location_result is None:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Error occurred when locating entities by hue in diffed images "
                    +
                    "for paper %s. None of the entities in batch %s will be detected.",
                    item.arxiv_id,
                    batch,
                )
                continue

            if self.should_sanity_check_images() and location_result.black_pixels_found:
                logging.warning(  # pylint: disable=logging-not-lazy
                    "Ignoring bounding boxes found for paper %s in batch %s due to "
                    +
                    "black pixels found in the images. This might indicate that the colorization "
                    + "commands introduced subtle shifts of the text.",
                    item.arxiv_id,
                    batch,
                )
                continue

            # If colorizing entities seemed to cause drift in the document...
            if len(location_result.shifted_entities) > 0:

                logging.warning(  # pylint: disable=logging-not-lazy
                    "Some entities shifted position in the colorized TeX for paper %s batch %s: "
                    +
                    "%s. Attempting to remove the first shifted entity from the batch.",
                    item.arxiv_id,
                    batch,
                    location_result.shifted_entities,
                )

                first_shifted_entity_id = None
                for entity_id in batch:
                    if entity_id in location_result.shifted_entities:
                        first_shifted_entity_id = entity_id
                        break

                if first_shifted_entity_id is not None:
                    if len(batch) > 1:
                        logging.info(  # pylint: disable=logging-not-lazy
                            "Entity %s has been marked as being the potential cause of shifting in "
                            +
                            "the colorized document for paper %s batch %d-%d. It will be processed "
                            +
                            "later on its own. The other shifted entities in %s will be queued to "
                            + "process as a group in an upcoming batch.",
                            first_shifted_entity_id,
                            item.arxiv_id,
                            item.group,
                            batch_index,
                            location_result.shifted_entities,
                        )

                        # Get the index of the first entity for which the location has shifted
                        # during colorization.
                        moved_entity_index = batch.index(
                            first_shifted_entity_id)

                        # Mark all other entities that have shifted after the first one to be processed
                        # in a later batch (instead of on their own). It could be that they won't shift
                        # once the first shifted entity is removed.
                        for i in range(len(batch) - 1, moved_entity_index, -1):
                            if batch[i] in location_result.shifted_entities:
                                to_process.appendleft(batch[i])
                                del batch[i]

                        # Mark the first entity that shifted to be reprocessed alone, where its position
                        # might be discoverable, without affecting the positions of other elements.
                        del batch[moved_entity_index]
                        to_process_alone.append(first_shifted_entity_id)

                    elif len(batch) == 1 and self.should_sanity_check_images():
                        logging.info(  # pylint: disable=logging-not-lazy
                            "Skipping entity %s for paper %s as it caused " +
                            "colorization errors even when colorized in isolation.",
                            first_shifted_entity_id,
                            item.arxiv_id,
                        )
                        continue
                    elif len(batch) == 1:
                        logging.info(  # pylint: disable=logging-not-lazy
                            "Entity %s has been marked as the cause of shifting in "
                            +
                            "the colorized document for paper %s. Its location will "
                            +
                            "still be saved (if one was found), though this location should be "
                            + "considered potentially inaccurate.",
                            first_shifted_entity_id,
                            item.arxiv_id,
                        )

                else:
                    logging.warning(  # pylint: disable=logging-not-lazy
                        "Could not find a single entity that was likely responsible for shifting in "
                        +
                        "the colorized version of paper %s batch %d-%d. All entities in batch %s will "
                        + "be processed on their own.",
                        item.arxiv_id,
                        item.group,
                        batch_index,
                        batch,
                    )
                    to_process_alone.extend(batch)

            logging.debug(
                "Finished attempt at locating entities with image diffs for paper %s iteration %s.",
                item.arxiv_id,
                iteration_id,
            )

            # The code above is responsible for filtering 'batch' to ensure that it doesn't include
            # any entity IDs that shouldn't be saved to file, for example if the client has asked that
            # entity IDs that cause colorization errors be omitted from the results.
            for entity_id in batch:
                for box in location_result.locations[entity_id]:
                    yield HueLocationInfo(
                        tex_path=item.tex_path,
                        iteration=iteration_id,
                        hue=entity_hues[entity_id],
                        entity_id=entity_id,
                        page=box.page,
                        left=box.left,
                        top=box.top,
                        width=box.width,
                        height=box.height,
                    )

        _cleanup_from_last_batch()
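One idiom in this method deserves a note: skipped or problematic entities are requeued with to_process.extendleft(reversed(batch)). Because extendleft pushes its items onto the left end one at a time (and therefore reverses them), reversing the batch first puts the entities back at the head of the queue in their original order. A quick standalone illustration:

from collections import deque

queue = deque(["d", "e"])
# Prepend a batch while preserving its order.
queue.extendleft(reversed(["a", "b", "c"]))
print(list(queue))  # ['a', 'b', 'c', 'd', 'e']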
Example 4
        "--output-dir",
        help=
        ("Directory into which the arXiv sources will be fetched. The fetched sources will "
         +
         "be saved in a subfolder of the output folder with its name as the arXiv ID "
         + "(i.e., 'output_dir/<arxiv_id>/')."),
        default="tmp",
    )

    args = parser.parse_args()
    arxiv_id = args.arxiv_id

    output_dir = args.output_dir
    archives_dir = os.path.join(output_dir, "archives")
    archive_path = os.path.join(archives_dir,
                                directories.escape_slashes(arxiv_id))
    sources_dir = os.path.join(output_dir,
                               directories.escape_slashes(arxiv_id))

    if not os.path.exists(archives_dir):
        print(f"Creating directory to hold source archives at {archives_dir}.")
        os.makedirs(archives_dir)

    print(
        f"Downloading archive of source files from arXiv for paper {arxiv_id}...",
        end="",
    )
    fetch_from_arxiv(arxiv_id, dest=archive_path)
    print("done.")

    if not os.path.exists(sources_dir):
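The escape_slashes calls matter for old-style arXiv IDs, which contain a slash that a path join would otherwise treat as a directory separator. A small illustration of the failure mode the helper avoids:

import os

# Old-style arXiv IDs contain "/", e.g. "math/0211159". Joined naively,
# the ID becomes two path components instead of one:
print(os.path.join("tmp", "archives", "math/0211159"))
# tmp/archives/math/0211159  (nested under an unintended "math" directory)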
Example 5
    def get_paper_dirs(self, arxiv_id: ArxivId) -> Iterator[RelativePath]:
        return iter([directories.escape_slashes(arxiv_id)])
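Every example on this page routes arXiv IDs and relative paths through directories.escape_slashes before using them as a single path component. Its implementation is not shown here; a minimal sketch of what it plausibly does, assuming a simple substitution (the replacement token is an assumption):

def escape_slashes(s: str) -> str:
    # Hypothetical sketch: "/" cannot appear inside a single directory or
    # file name, so swap it for a placeholder token.
    return s.replace("/", "__SLASH__")

print(escape_slashes("math/0211159"))  # math__SLASH__0211159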
Example 6
    def save(self, item: ArxivId, _: None) -> None:

        upload_path = (
            f"s3://{self.args.s3_bucket}/{self.args.s3_prefix}/dump/by_arxiv_id/{item}"
        )
        command_args = [
            "aws",
            "s3",
            "sync",
            directories.DATA_DIR,
            upload_path,
            "--exclude",
            "*",
            # This is not a _perfect_ filter: if an arXiv paper has in its sources a directory
            # with the name of another arXiv ID, that directory will be copied when processing
            # the paper with that ID. This shouldn't come up often enough to be a concern.
            "--include",
            f"*/{directories.escape_slashes(item)}",
            "--include",
            f"*/{directories.escape_slashes(item)}/*",
        ]
        logging.debug("Uploading all output with command %s", command_args)
        command_result = subprocess.run(
            command_args,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
        logging.debug("Finished uploading all output for arXiv ID %s to S3",
                      item)
        if command_result.returncode != 0:
            logging.warning(
                "Error uploading all output to S3 for arXiv ID %s: %s",
                item,
                command_result.stderr,
            )

        with TemporaryDirectory() as staging_dir_path:
            for spec in RESULT_SPECS:
                glob_pattern = os.path.join(
                    directories.dirpath(spec.dirkey),
                    directories.escape_slashes(item),
                    spec.glob,
                )
                paths = glob.glob(glob_pattern)
                if len(paths) == 0:
                    logging.warning(  # pylint: disable=logging-not-lazy
                        ("Could not find any files matching %s for arXiv ID %s. There may have been errors "
                         +
                         "in processing the stage that would generate these files for this paper."
                         ),
                        glob_pattern,
                        item,
                    )

                result_dir = os.path.join(staging_dir_path, spec.name)
                os.makedirs(result_dir)

                for i, path in enumerate(paths):
                    __, ext = os.path.splitext(path)
                    filename = f"{item}-{i}{ext}"
                    dest_path = os.path.join(result_dir, filename)
                    logging.debug("Staging %s to temporary directory %s", path,
                                  dest_path)
                    shutil.copy(path, dest_path)

            upload_path = f"s3://{self.args.s3_bucket}/{self.args.s3_prefix}/results"
            command_args = ["aws", "s3", "sync", staging_dir_path, upload_path]
            logging.debug("Uploading results with command %s", command_args)
            command_result = subprocess.run(
                command_args,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            logging.debug("Finished uploading results for arXiv ID %s to S3.",
                          item)
            if command_result.returncode != 0:
                logging.warning(
                    "Error uploading results to S3 for arXiv ID %s: %s",
                    item,
                    command_result.stderr,
                )
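To make the first sync concrete, here is roughly the command the code assembles, rebuilt with hypothetical values (the bucket, prefix, arXiv ID, and data directory are illustrative): --exclude "*" drops every file, and each --include re-admits only paths containing this paper's escaped arXiv ID.

import shlex

# Hypothetical values, for illustration only.
bucket, prefix, item = "mybucket", "pipeline", "1802.04865"
command_args = [
    "aws", "s3", "sync", "data",
    f"s3://{bucket}/{prefix}/dump/by_arxiv_id/{item}",
    "--exclude", "*",
    "--include", f"*/{item}",
    "--include", f"*/{item}/*",
]
print(shlex.join(command_args))
# Output (wrapped for readability):
#   aws s3 sync data s3://mybucket/pipeline/dump/by_arxiv_id/1802.04865
#       --exclude '*' --include '*/1802.04865' --include '*/1802.04865/*'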