def test_download_files_with_bad_urls():
    """download_files must raise RuntimeError when every download fails."""
    failing_urls = ["http://foobar" for _ in range(6)]
    with TemporaryDirectory() as destination, patch(
            "requests.Session.get") as mocked_get:
        # Simulate an unsuccessful HTTP response for every request
        mocked_get.return_value.ok = False
        with pytest.raises(RuntimeError):
            download_files(failing_urls, Path(destination))
def test_download_files(directory_of_mbox_files, dry_run):
    """Re-download mbox files that the fixture has already fetched."""
    # The fixture must have populated the destination before we start
    assert directory_of_mbox_files

    base = (
        "https://mail-archives.apache.org/mod_mbox/httpd-users/20190{month}.mbox"
    )
    destination = Path("/tmp/libratom/test_data/httpd-users")
    monthly_urls = [base.format(month=month) for month in range(1, 7)]

    download_files(monthly_urls, destination, dry_run=dry_run)
def download_edrm_zipped_files() -> None:
    """Download edrm files into CACHED_ENRON_DATA_DIR."""
    custodians = (
        "albert_meyers",
        "andrea_ring",
        "andrew_lewis",
        "andy_zipper",
        "chris_dorland",
        "jason_wolfe",
        "vkaminski",
    )

    # ZIP archives to fetch, one per custodian name
    archive_urls = [f"{ENRON_DATASET_URL}/{name}.zip" for name in custodians]

    download_files(archive_urls, CACHED_ENRON_DATA_DIR, dry_run=False)
def directory_of_mbox_files() -> Path:
    """Yield a directory populated with multiple mbox files.

    Returns:
        A directory with multiple mbox files
    """
    base = (
        "https://mail-archives.apache.org/mod_mbox/httpd-users/20190{month}.mbox"
    )
    # Destination directory for the downloads
    destination = CACHED_HTTPD_USERS_MAIL_DIR

    # Fetch six monthly mailing list digests (Jan-Jun 2019)
    download_files([base.format(month=m) for m in range(1, 7)], destination)

    yield destination
def download_media_type_files(out: Path) -> None:
    """Download media type files from https://www.iana.org/
    and write a JSON file of all media types to *out*.

    Args:
        out: Destination path for the generated JSON file.
    """
    media_types = []
    media_type_registries = [
        "application",
        "audio",
        "font",
        "image",
        "message",
        "model",
        "multipart",
        "text",
        "video",
    ]

    # CSV files to download, one per registry
    urls = [
        f"https://www.iana.org/assignments/media-types/{registry}.csv"
        for registry in media_type_registries
    ]

    with TemporaryDirectory() as tmpdir:
        directory = Path(tmpdir)
        download_files(urls, directory, dry_run=False)

        for file in directory.glob("*.csv"):
            with file.open(newline="") as csvfile:
                reader = csv.reader(csvfile)
                # Skip the header row explicitly; the old per-row filter on the
                # literal "Name" would also drop a genuine entry named "Name".
                # The default tolerates an empty file.
                next(reader, None)
                for row in reader:
                    # Guard against blank rows / empty first field, which used
                    # to raise ValueError (unpacking) or IndexError (split)
                    if not row or not row[0]:
                        continue
                    # Use the first token of the Name column; the split strips
                    # DEPRECATED/OBSOLETED/... mentions appended to the name
                    name = row[0]
                    media_types.append(
                        f"{file.stem}/{name.split(maxsplit=1)[0]}")

    with out.open(mode="w") as f:
        json.dump(sorted(media_types), f, indent=4)