Example #1
def test_download_files_with_bad_urls():

    bad_urls = ["http://foobar"] * 6

    with TemporaryDirectory() as tmpdir, patch(
            "requests.Session.get") as mock_get:
        mock_get.return_value.ok = False

        with pytest.raises(RuntimeError):
            download_files(bad_urls, Path(tmpdir))
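The RuntimeError expectation above only makes sense if download_files fails loudly on bad responses. A minimal, self-contained sketch of such a helper, assuming a urls/directory/dry_run signature and per-URL error handling (this is not the library's actual implementation), could look like:

from pathlib import Path
from typing import Iterable

import requests


def download_files(urls: Iterable[str], directory: Path, dry_run: bool = False) -> None:
    # Hypothetical sketch: fetch each URL into directory, failing loudly on a bad response
    directory.mkdir(parents=True, exist_ok=True)

    if dry_run:
        return

    with requests.Session() as session:
        for url in urls:
            response = session.get(url)
            if not response.ok:
                raise RuntimeError(f"Unable to download {url}")

            # Name the local file after the last URL segment
            (directory / url.rsplit("/", maxsplit=1)[-1]).write_bytes(response.content)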
Example #2
def test_download_files(directory_of_mbox_files, dry_run):

    assert directory_of_mbox_files  # ensures the fixture ran and the files are already present

    # Try to re-download files already downloaded by the fixture
    url_template = (
        "https://mail-archives.apache.org/mod_mbox/httpd-users/20190{month}.mbox"
    )
    path = Path("/tmp/libratom/test_data/httpd-users")
    urls = [url_template.format(month=i) for i in range(1, 7)]
    download_files(urls, path, dry_run=dry_run)
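The dry_run argument is injected as a fixture, so the test presumably runs once per value, with and without touching the network. A plausible parametrized fixture, assuming the project exposes it under this name (the snippet above does not confirm it), would be:

import pytest


@pytest.fixture(params=[False, True])
def dry_run(request):
    # Hypothetical fixture: parametrize tests over both dry_run values
    return request.param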
Example #3
def download_edrm_zipped_files() -> None:
    """Download zipped EDRM files into CACHED_ENRON_DATA_DIR."""

    names = [
        "albert_meyers",
        "andrea_ring",
        "andrew_lewis",
        "andy_zipper",
        "chris_dorland",
        "jason_wolfe",
        "vkaminski",
    ]

    # Zip files to download
    urls = [f"{ENRON_DATASET_URL}/{name}.zip" for name in names]

    download_files(urls, CACHED_ENRON_DATA_DIR, dry_run=False)
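ENRON_DATASET_URL and CACHED_ENRON_DATA_DIR are constants defined elsewhere in the project and are not shown in this snippet. A hypothetical follow-up check, whose test name and assertions are assumptions rather than part of the source, could confirm the archives landed where expected:

def test_download_edrm_zipped_files():
    # Hypothetical check: the expected zip archives exist and are non-empty
    download_edrm_zipped_files()

    for name in ("albert_meyers", "vkaminski"):
        zip_file = CACHED_ENRON_DATA_DIR / f"{name}.zip"
        assert zip_file.is_file()
        assert zip_file.stat().st_size > 0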
Example #4
def directory_of_mbox_files() -> Path:
    """
    Returns:
        A directory with multiple mbox files
    """

    url_template = (
        "https://mail-archives.apache.org/mod_mbox/httpd-users/20190{month}.mbox"
    )

    # path is our destination directory
    path = CACHED_HTTPD_USERS_MAIL_DIR

    # Download 6 monthly mailing list digests
    urls = [url_template.format(month=i) for i in range(1, 7)]
    download_files(urls, path)

    yield path
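Because this generator yields its result and is consumed by name in test_download_files (Example #2), it is presumably registered as a pytest fixture. A hedged sketch of that registration, where the decorator and session scope are assumptions, the body is unchanged, and download_files plus CACHED_HTTPD_USERS_MAIL_DIR come from the project's own modules (import paths omitted):

import pytest
from pathlib import Path


@pytest.fixture(scope="session")  # assumed scope so the digests are downloaded only once per run
def directory_of_mbox_files() -> Path:
    urls = [
        f"https://mail-archives.apache.org/mod_mbox/httpd-users/20190{month}.mbox"
        for month in range(1, 7)
    ]
    download_files(urls, CACHED_HTTPD_USERS_MAIL_DIR)
    yield CACHED_HTTPD_USERS_MAIL_DIR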
Example #5
def download_media_type_files(out: Path) -> None:
    """Download media type files from https://www.iana.org/ and write a JSON file of all media types.
    """

    media_types = []

    media_type_registries = [
        "application",
        "audio",
        "font",
        "image",
        "message",
        "model",
        "multipart",
        "text",
        "video",
    ]

    # CSV files to download
    urls = [
        f"https://www.iana.org/assignments/media-types/{registry}.csv"
        for registry in media_type_registries
    ]

    with TemporaryDirectory() as tmpdir:
        directory = Path(tmpdir)
        download_files(urls, directory, dry_run=False)

        for file in directory.glob("*.csv"):
            with file.open(newline="") as csvfile:
                reader = csv.reader(csvfile)

                # Use the first token (Name) in each row, skip headers
                # The split is to strip DEPRECATED/OBSOLETED/... mentions appended to the name
                for name, *_ in reader:
                    if name != "Name":
                        media_types.append(
                            f"{file.stem}/{name.split(maxsplit=1)[0]}")

    with out.open(mode="w") as f:
        json.dump(sorted(media_types), f, indent=4)
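Since out is used with Path-style open(), the function presumably expects a pathlib.Path. A short usage sketch, where the output location is an arbitrary choice for illustration, runs the download and inspects the resulting JSON:

import json
from pathlib import Path

out = Path("/tmp/media_types.json")  # arbitrary output location for this sketch
download_media_type_files(out)

with out.open() as f:
    media_types = json.load(f)

# Entries look like "<registry>/<name>", e.g. "text/csv"
print(f"{len(media_types)} media types, first few: {media_types[:3]}")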