def test_maybe_download_retry(caplog):
    """Requesting a nonexistent URL raises HTTPError and logs the problem."""
    caplog.clear()
    caplog.set_level(logging.INFO)
    with pytest.raises(requests.exceptions.HTTPError):
        maybe_download(
            "https://recodatasets.z20.web.core.windows.net/non_existing_file.zip"
        )
    assert "Problem downloading" in caplog.text
def test_maybe_download_maybe(caplog, files_fixtures):
    """A repeated download of an already-present file is skipped and logged."""
    caplog.clear()
    caplog.set_level(logging.INFO)
    file_url, filepath = files_fixtures
    # Start from a clean slate so the first call really downloads.
    if os.path.exists(filepath):
        os.remove(filepath)
    downloaded_filepath = maybe_download(file_url, "license.txt")
    assert os.path.exists(downloaded_filepath)
    # Second call should detect the existing file and log instead of fetching.
    maybe_download(file_url, "license.txt")
    assert "File ./license.txt already downloaded" in caplog.text
def download_movielens(size, dest_path):
    """Download a MovieLens data file.

    Args:
        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
        dest_path (str): File path for the downloaded file.

    Raises:
        ValueError: If ``size`` is not a supported MovieLens size.
    """
    if size not in DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)
    url = f"http://files.grouplens.org/datasets/movielens/ml-{size}.zip"
    folder, filename = os.path.split(dest_path)
    maybe_download(url, filename, work_directory=folder)
def _download_reviews(name, dest_path):
    """Download an Amazon reviews data file.

    Args:
        name (str): Category of reviews.
        dest_path (str): File path for the downloaded file.
    """
    base_url = "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/"
    url = base_url + name + ".gz"
    folder, filename = os.path.split(dest_path)
    maybe_download(url, filename + ".gz", work_directory=folder)
def download_deeprec_resources(azure_container_url, data_path, remote_resource_name):
    """Download a zipped resource from an Azure container and extract it.

    The downloaded archive is removed after extraction.

    Args:
        azure_container_url (str): URL of Azure container.
        data_path (str): Path to download the resources.
        remote_resource_name (str): Name of the resource (zip file) in the container.
    """
    os.makedirs(data_path, exist_ok=True)
    remote_path = azure_container_url + remote_resource_name
    maybe_download(remote_path, remote_resource_name, data_path)
    zip_path = os.path.join(data_path, remote_resource_name)
    # Context manager guarantees the archive handle is closed even if
    # extraction raises (the previous manual open/close leaked it on error).
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(data_path)
    os.remove(zip_path)
def download_mind(size="small", dest_path=None):
    """Download the MIND dataset.

    Args:
        size (str): Dataset size. One of ["small", "large", "demo"].
        dest_path (str): Download path. If None, the dataset is downloaded
            to a temporary path.

    Returns:
        str, str: Path to train and validation sets.

    Raises:
        ValueError: If ``size`` is not a supported option.
    """
    size_options = ["small", "large", "demo"]
    if size not in size_options:
        raise ValueError(f"Wrong size option, available options are {size_options}")
    url_train, url_valid = URL_MIND[size]
    with download_path(dest_path) as path:
        train_path = maybe_download(url=url_train, work_directory=path)
        valid_path = maybe_download(url=url_valid, work_directory=path)
        return train_path, valid_path
def test_maybe_download(files_fixtures):
    """Downloading with the correct expected size succeeds and keeps the name."""
    file_url, filepath = files_fixtures
    if os.path.exists(filepath):
        os.remove(filepath)
    downloaded_filepath = maybe_download(file_url, "license.txt", expected_bytes=1162)
    assert os.path.exists(downloaded_filepath)
    assert downloaded_filepath.split("/")[-1] == "license.txt"
def test_maybe_download_wrong_bytes(caplog, files_fixtures):
    """A size mismatch raises IOError and logs the verification failure."""
    caplog.clear()
    caplog.set_level(logging.INFO)
    file_url, filepath = files_fixtures
    if os.path.exists(filepath):
        os.remove(filepath)
    with pytest.raises(IOError):
        filepath = maybe_download(file_url, "license.txt", expected_bytes=0)
    assert "Failed to verify license.txt" in caplog.text
def download_criteo(size="sample", work_directory="."):
    """Download the Criteo dataset as a compressed file.

    Args:
        size (str): Size of the Criteo dataset. It can be "full" or "sample".
        work_directory (str): Working directory.

    Returns:
        str: Path of the downloaded file.
    """
    return maybe_download(CRITEO_URL[size], work_directory=work_directory)
def download_and_extract_glove(dest_path):
    """Download the GloVe embeddings and extract them under ``dest_path``.

    Args:
        dest_path (str): Destination directory path for the downloaded file.

    Returns:
        str: File path where GloVe was extracted.
    """
    zip_path = maybe_download(
        url="http://nlp.stanford.edu/data/glove.6B.zip", work_directory=dest_path
    )
    glove_path = os.path.join(dest_path, "glove")
    # Keep the zip around (clean_zip_file=False) so repeat calls skip the download.
    unzip_file(zip_path, glove_path, clean_zip_file=False)
    return glove_path