Example #1
    def _strip_arxiv_version(self, item: ArxivId) -> ArxivId:
        """Remove any version identifier from the passed ArXiv ID.
        Looks for the last 'v' (case-insensitive) and strips beyond it.
        Reference: https://arxiv.org/help/arxiv_identifier_for_services

        >>> c._strip_arxiv_version("1703.03400")
        1703.03400
        >>> c._strip_arxiv_version("1703.03400v0")
        1703.03400
        >>> c._strip_arxiv_version("1703.03400v3")
        1703.03400
        >>> c._strip_arxiv_version("1703.03400vasdfa")
        1703.03400
        >>> c._strip_arxiv_version("1703.03400asdfa")
        1703.03400asdfa
        >>> c._strip_arxiv_version("1703.03400V0")
        1703.03400
        >>> c._strip_arxiv_version("1703.03400V0v1")
        1703.03400V0
        >>> c._strip_arxiv_version("math.GT/0309123")
        math.GT/0309123
        >>> c._strip_arxiv_version("math.GT/0309123v1")
        math.GT/0309123
        >>> c._strip_arxiv_version("math.VT/0309123")
        math.VT/0309123
        >>> c._strip_arxiv_version("math.VT/0309123v1")
        math.VT/0309123
        """

        # Don't strip from the archive/subject-class prefix (e.g., the 'V' in math.VT above);
        # only look for a version marker after the '/'.
        start_index = max(0, item.find("/"))

        lower_v = item.rfind("v", start_index)
        upper_v = item.rfind("V", start_index)

        found_index = max(lower_v, upper_v)

        if found_index == -1:
            return item

        return item[0:found_index]
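
For quick experimentation outside the class, the same stripping rule can be written as a standalone helper. The sketch below mirrors the method's logic and its doctests; the function name and the assert-based checks are illustrative additions, not part of the original code.

def strip_arxiv_version(item: str) -> str:
    # Ignore any 'v'/'V' inside the archive/subject-class prefix (e.g., math.VT/...).
    start_index = max(0, item.find("/"))
    # Strip from the last 'v' or 'V' found after that point, if any.
    found_index = max(item.rfind("v", start_index), item.rfind("V", start_index))
    return item if found_index == -1 else item[:found_index]


if __name__ == "__main__":
    assert strip_arxiv_version("1703.03400v3") == "1703.03400"
    assert strip_arxiv_version("math.GT/0309123v1") == "math.GT/0309123"
    assert strip_arxiv_version("math.VT/0309123") == "math.VT/0309123"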
Example #2
import logging
import os
import re
import shutil
import subprocess
from tempfile import TemporaryDirectory

# `ArxivId` (a str alias) and the `directories` path helpers are defined elsewhere
# in the surrounding project.


def fetch_from_s3(arxiv_id: ArxivId, bucket: str) -> None:
    logging.debug("Fetching sources for arXiv paper %s from s3 storage.",
                  arxiv_id)
    arxiv_id_tokens = arxiv_id.split(".")
    year_month_match = re.match(r"\d{4}", arxiv_id_tokens[0])
    if not year_month_match:
        logging.warning(  # pylint: disable=logging-not-lazy
            ("Unexpected arXiv ID format %s; This method only works for fetching arXiv IDs"
             + " whose IDs start with YYMM. Skipping this paper."),
            arxiv_id,
        )
        return

    year_month = year_month_match.group(0)
    s3_path = f"s3://{bucket}/bymonth/{year_month}"
    with TemporaryDirectory() as download_dir_path:
        command_args = [
            "aws",
            "s3",
            "cp",
            s3_path,
            download_dir_path,
            # Each paper should have a file on S3, though the extension of that file is unknown.
            # For example, files could have, at least, a '.gz' or '.pdf' extension. To copy a file
            # with an unknown extension, copy recursively, including files that match the arXiv ID.
            "--recursive",
            "--exclude",
            "*",
            "--include",
            f"{arxiv_id}*",
        ]
        logging.debug("Fetching sources with command %s", command_args)
        result = subprocess.run(
            command_args,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
        logging.debug(
            "Finished running command to fetch S3 sources for arXiv ID %s",
            arxiv_id)
        if result.returncode != 0:
            logging.warning(
                "Error fetching files from S3 for arXiv ID %s: %s",
                arxiv_id,
                result.stderr,
            )

        logging.debug(
            "Moving files downloaded to %s for arXiv ID %s to %s",
            download_dir_path,
            arxiv_id,
            directories.arxiv_subdir("sources-archives", arxiv_id),
        )
        downloaded_files = os.listdir(download_dir_path)
        if len(downloaded_files) == 0:
            logging.warning(  # pylint: disable=logging-not-lazy
                ("No files fetched for arXiv ID %s. It's possible there are no files for this "
                 + "paper on S3."),
                arxiv_id,
            )
            return
        if len(downloaded_files) > 1:
            logging.warning(
                "Unexpectedly downloaded more than one source archive file for arXiv ID %s",
                arxiv_id,
            )
        downloaded_file_path = os.path.join(download_dir_path,
                                            downloaded_files[0])
        save_path = directories.arxiv_subdir("sources-archives", arxiv_id)
        if not os.path.exists(os.path.dirname(save_path)):
            os.makedirs(os.path.dirname(save_path))
        shutil.move(downloaded_file_path, save_path)
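
The --recursive/--exclude/--include trick above copies a single object whose file extension is unknown by filtering on the key name. A similar fetch can be sketched with boto3 instead of shelling out to the aws CLI; the helper below is an illustrative assumption rather than part of the original module, and it reuses the bymonth/<YYMM>/ key layout implied by the command above.

import logging
import os

import boto3


def fetch_from_s3_boto3(arxiv_id: str, bucket: str, save_path: str) -> None:
    # Assumes a new-style arXiv ID whose first four characters are YYMM, as above.
    prefix = f"bymonth/{arxiv_id[:4]}/{arxiv_id}"
    s3 = boto3.client("s3")
    # List every object whose key starts with the arXiv ID; the extension is unknown.
    listing = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
    contents = listing.get("Contents", [])
    if not contents:
        logging.warning("No files found on S3 for arXiv ID %s", arxiv_id)
        return
    if len(contents) > 1:
        logging.warning("More than one source archive found for arXiv ID %s", arxiv_id)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    s3.download_file(bucket, contents[0]["Key"], save_path)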