def test_download_image(self):
    """A plain download saves a non-empty file named by escape_image_name."""
    reset_directory(IMAGE_DIRECTORY)
    target_url = URLS[0]
    download_image(target_url, IMAGE_DIRECTORY)
    expected_name = escape_image_name(target_url)
    assert expected_name in os.listdir(IMAGE_DIRECTORY)
    saved_path = os.path.join(IMAGE_DIRECTORY, expected_name)
    assert os.path.getsize(saved_path) > 0
def test_download_image_with_timeout_exceed_time(self):
    """With a zero-second timeout the download reports failure and saves nothing."""
    reset_directory(IMAGE_DIRECTORY)
    target_url = URLS[0]
    # timeout of 0 guarantees the deadline is exceeded immediately
    succeeded = download_image_with_timeout(target_url, IMAGE_DIRECTORY, 0)
    assert not succeeded
    assert escape_image_name(target_url) not in os.listdir(IMAGE_DIRECTORY)
def test_download_image_with_timeout_within_time(self):
    """With a generous timeout the download succeeds and writes a non-empty file."""
    reset_directory(IMAGE_DIRECTORY)
    target_url = URLS[0]
    succeeded = download_image_with_timeout(target_url, IMAGE_DIRECTORY, 5)
    assert succeeded
    expected_name = escape_image_name(target_url)
    assert expected_name in os.listdir(IMAGE_DIRECTORY)
    saved_path = os.path.join(IMAGE_DIRECTORY, expected_name)
    assert os.path.getsize(saved_path) > 0
def download_image(url: str, output_directory: str):
    """Fetch the image at *url* and save it into *output_directory*.

    The file name is derived from the URL via escape_image_name.

    Parameters:
        url (str): URL to try and download image from
        output_directory (str): directory to save the image to
    """
    destination = os.path.join(output_directory, escape_image_name(url))
    payload = download_url(url)
    with open(destination, "wb") as image_file:
        image_file.write(payload)
def test_multi_thread_image_download(self):
    """Downloading from a URL file fetches every URL in URLS as a non-empty file."""
    reset_directory(IMAGE_DIRECTORY)
    total_images = multi_thread_image_download(
        URL_FILE,
        IMAGE_DIRECTORY,
        max_fetching_threads=2,
        download_timeout=5,
        verbose=False,
    )
    assert total_images == len(URLS)
    for target_url in URLS:
        expected_name = escape_image_name(target_url)
        assert expected_name in os.listdir(IMAGE_DIRECTORY)
        saved_path = os.path.join(IMAGE_DIRECTORY, expected_name)
        assert os.path.getsize(saved_path) > 0
def test_multi_thread_image_download_list(self):
    """multi_thread_image_download also accepts a list of URLs directly.

    Asserts against the local list that was actually passed, not the
    module-level URLS constant — previously the test compared against
    URLS, which would silently break (or pass for the wrong reason) if
    URLS and this hard-coded list ever drifted apart.
    """
    reset_directory(IMAGE_DIRECTORY)
    urls = [
        "https://benaandrew.github.io/images/sentiment.jpg",
        "https://benaandrew.github.io/images/dog.jpg",
    ]
    total_images = multi_thread_image_download(
        urls,
        IMAGE_DIRECTORY,
        max_fetching_threads=2,
        download_timeout=5,
        verbose=False,
    )
    assert total_images == len(urls)
    for url in urls:
        file_name = escape_image_name(url)
        assert file_name in os.listdir(IMAGE_DIRECTORY)
        assert os.path.getsize(os.path.join(IMAGE_DIRECTORY, file_name)) > 0
def test_escape_image_name_https(self):
    """The scheme and separators are stripped; the extension dot is kept."""
    escaped = escape_image_name("https://www.fakewebsite.com/sample.jpg")
    assert escaped == "wwwfakewebsitecomsample.jpg"
def test_escape_image_name_non_alphanumeric_characters(self):
    """Non-alphanumeric characters (!, $, _) are dropped from the name."""
    escaped = escape_image_name("https://www.fakewebsite!.com/sample$_.jpg")
    assert escaped == "wwwfakewebsitecomsample.jpg"
def multi_thread_image_download(
    urls: "str | list",
    output_directory: str,
    max_fetching_threads=None,
    download_timeout=5,
    verbose=True,
):
    """
    Downloads list of images using multiple threads.

    Parameters:
        urls (list/str): list of URLs or path to text file containing list of URLs
        output_directory (str): destination directory path
        max_fetching_threads (int): maximum number of concurrent image download
            threads (default None = ThreadPoolExecutor default, cores * 5)
        download_timeout (int): maximum wait time in seconds for an image
            download; None disables the per-download timeout (default is 5)
        verbose (bool): show tqdm progress bar (default True)

    Returns:
        int: total files in the directory (includes any pre-existing files)
    """
    # If urls is not a list it must be a path to a file with a list of URLs
    if isinstance(urls, str):
        with open(urls, "r") as url_file:
            urls = set(url_file.read().splitlines())
    if not os.path.isdir(output_directory):
        # makedirs (not mkdir) so missing parent directories don't raise
        os.makedirs(output_directory, exist_ok=True)
    else:
        # Exclude existing images. Snapshot the directory listing once into a
        # set instead of calling os.listdir() for every URL (was O(n*m)).
        existing = set(os.listdir(output_directory))
        urls = [url for url in urls if escape_image_name(url) not in existing]
    # Build concurrent thread pool with max_fetching_threads workers
    with ThreadPoolExecutor(max_fetching_threads) as pool:
        if download_timeout is not None:
            futures = [
                pool.submit(
                    download_image_with_timeout,
                    url,
                    output_directory,
                    download_timeout,
                )
                for url in urls
            ]
        else:
            futures = [
                pool.submit(download_image, url, output_directory)
                for url in urls
            ]
        if verbose:
            # total= lets tqdm show a real progress bar instead of a bare counter
            for _ in tqdm(as_completed(futures), total=len(futures)):
                pass
        else:
            wait(futures)
    return len(os.listdir(output_directory))