def _strip_arxiv_version(self, item: ArxivId) -> ArxivId:
    """Remove any version identifier from the passed arXiv ID.

    Looks for the last 'v' (case-insensitive) and strips it and everything after it.
    Reference: https://arxiv.org/help/arxiv_identifier_for_services

    >>> c._strip_arxiv_version("1703.03400")
    1703.03400
    >>> c._strip_arxiv_version("1703.03400v0")
    1703.03400
    >>> c._strip_arxiv_version("1703.03400v3")
    1703.03400
    >>> c._strip_arxiv_version("1703.03400vasdfa")
    1703.03400
    >>> c._strip_arxiv_version("1703.03400asdfa")
    1703.03400asdfa
    >>> c._strip_arxiv_version("1703.03400V0")
    1703.03400
    >>> c._strip_arxiv_version("1703.03400V0v1")
    1703.03400V0
    >>> c._strip_arxiv_version("math.GT/0309123")
    math.GT/0309123
    >>> c._strip_arxiv_version("math.GT/0309123v1")
    math.GT/0309123
    >>> c._strip_arxiv_version("math.VT/0309123")
    math.VT/0309123
    >>> c._strip_arxiv_version("math.VT/0309123v1")
    math.VT/0309123
    """
    # Don't strip characters from archive or subject-class names (e.g., the 'V' in
    # math.VT above): only search for a version marker after the '/' in old-style IDs.
    start_index = max(0, item.find("/"))
    lower_v = item.rfind("v", start_index)
    upper_v = item.rfind("V", start_index)
    found_index = max(lower_v, upper_v)
    if found_index == -1:
        return item
    return item[0:found_index]
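# The doctests above assume an already-constructed instance `c` of the class that hosts
# this method. A minimal usage sketch (`ArxivIdUtils` is a hypothetical name standing in
# for that class, not the actual one in this module):
#
#     c = ArxivIdUtils()
#     assert c._strip_arxiv_version("1703.03400v3") == "1703.03400"
#     assert c._strip_arxiv_version("math.VT/0309123v1") == "math.VT/0309123"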
def fetch_from_s3(arxiv_id: ArxivId, bucket: str) -> None:
    logging.debug("Fetching sources for arXiv paper %s from S3 storage.", arxiv_id)

    arxiv_id_tokens = arxiv_id.split(".")
    year_month_match = re.match(r"\d{4}", arxiv_id_tokens[0])
    if not year_month_match:
        logging.warning(  # pylint: disable=logging-not-lazy
            ("Unexpected arXiv ID format %s. This method only works for fetching arXiv IDs"
             + " that start with YYMM. Skipping this paper."),
            arxiv_id,
        )
        return
    year_month = year_month_match.group(0)
    s3_path = f"s3://{bucket}/bymonth/{year_month}"

    with TemporaryDirectory() as download_dir_path:
        command_args = [
            "aws",
            "s3",
            "cp",
            s3_path,
            download_dir_path,
            # Each paper should have a file on S3---though the extension of that file is unknown.
            # For example, files could have, at least, a '.gz' or '.pdf' extension. To copy a file
            # with an unknown extension, copy recursively, including only files that match the
            # arXiv ID.
            "--recursive",
            "--exclude",
            "*",
            "--include",
            f"{arxiv_id}*",
        ]
        logging.debug("Fetching sources with command %s", command_args)
        result = subprocess.run(
            command_args,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=False,
        )
        logging.debug(
            "Finished running command to fetch S3 sources for arXiv ID %s", arxiv_id
        )
        if result.returncode != 0:
            logging.warning(
                "Error fetching files from S3 for arXiv ID %s: %s",
                arxiv_id,
                result.stderr,
            )

        logging.debug(
            "Moving files downloaded to %s for arXiv ID %s to %s",
            download_dir_path,
            arxiv_id,
            directories.arxiv_subdir("sources-archives", arxiv_id),
        )
        downloaded_files = os.listdir(download_dir_path)
        if len(downloaded_files) == 0:
            logging.warning(  # pylint: disable=logging-not-lazy
                ("No files fetched for arXiv ID %s. It's possible there are no files for this "
                 + "paper on S3."),
                arxiv_id,
            )
            return
        if len(downloaded_files) > 1:
            logging.warning(
                "Unexpectedly downloaded more than one source archive file for arXiv ID %s",
                arxiv_id,
            )

        downloaded_file_path = os.path.join(download_dir_path, downloaded_files[0])
        save_path = directories.arxiv_subdir("sources-archives", arxiv_id)
        if not os.path.exists(os.path.dirname(save_path)):
            os.makedirs(os.path.dirname(save_path))
        shutil.move(downloaded_file_path, save_path)
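# A minimal usage sketch, assuming the AWS CLI is installed with credentials for the
# source bucket and that the `directories` helper is configured. The bucket name below
# is hypothetical, not the bucket used by this project:
#
#     fetch_from_s3("1703.03400", bucket="my-arxiv-source-mirror")
#     # On success, the downloaded archive is moved to
#     # directories.arxiv_subdir("sources-archives", "1703.03400").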