def test_download_multiprocess_chunks(self):
    """
    Check chunked multiprocess download: all three URLs must finish.

    With chunk_size=1 the completion order is nondeterministic, so the
    assertions use membership rather than positional equality.
    """
    urls = [
        'https://parl.ai/downloads/mnist/mnist.tar.gz',
        'https://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
        'https://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
    ]
    with testing_utils.capture_output() as stdout:
        download_results = build_data.download_multiprocess(
            urls, self.datapath, dest_filenames=self.dest_filenames, chunk_size=1
        )
    stdout = stdout.getvalue()
    output_filenames, output_statuses, output_errors = zip(*download_results)
    # Any ordering of the three destination names is acceptable.
    for expected_name in ('mnist0.tar.gz', 'mnist1.tar.gz', 'mnist2.tar.gz'):
        self.assertIn(expected_name, output_filenames, f'missing file:\n{stdout}')
    # One good URL (200) and at least one intentionally bad URL (403).
    self.assertIn(200, output_statuses, f'unexpected error code:\n{stdout}')
    self.assertIn(403, output_statuses, f'unexpected error code:\n{stdout}')
def download_images(opt, task='personality_captions'):
    """
    Download the YFCC images referenced by the given task's split files.

    Requires interactive confirmation that the user has permission to use
    the YFCC100m dataset; raises RuntimeError if either prompt is declined.
    Images are fetched one by one from the public s3 mirror into
    ``<datapath>/yfcc_images`` and the directory is marked done afterwards.
    """
    data_path = os.path.join(opt['datapath'], task)
    image_path = os.path.join(opt['datapath'], 'yfcc_images')
    version = '1.0'
    # The user must explicitly confirm dataset permission before any download.
    answer = input(
        'Please confirm that you have obtained permission '
        'to work with the YFCC100m dataset, as outlined by the steps '
        'listed at '
        'https://multimediacommons.wordpress.com/yfcc100m-core-dataset/ [Y/y]: '
    )
    if answer.lower() != 'y':
        raise RuntimeError(
            'In order to use the images from this dataset, '
            'you must obtain permission from the webpage above.'
        )
    answer = input(
        'NOTE: This script will download each image individually from the '
        's3 server on which the images are hosted. This will take a *very '
        'long* time. Are you sure you would like to continue? [Y/y]: '
    )
    if answer.lower() != 'y':
        raise RuntimeError(
            'If you have access to the images, please specify '
            'the path to the folder via the `--yfcc-path` '
            'command line argument.'
        )
    image_prefix = (
        'https://multimedia-commons.s3-us-west-2.amazonaws.com/data/images'
    )
    # image_chat names its validation split 'valid' rather than 'val'.
    splits = ['train', 'valid' if task == 'image_chat' else 'val', 'test']
    hashes = []
    for split in splits:
        with open(os.path.join(data_path, f'{split}.json')) as f:
            hashes.extend(d['image_hash'] for d in json.load(f))
    os.makedirs(image_path, exist_ok=True)
    print(f'[downloading images to {image_path}]')
    # The s3 mirror shards images by the first 3 and next 3 hash characters.
    image_urls = [f"{image_prefix}/{h[:3]}/{h[3:6]}/{h}.jpg" for h in hashes]
    download_multiprocess(
        image_urls, image_path, dest_filenames=[f"{h}.jpg" for h in hashes]
    )
    build_data.mark_done(image_path, version)
def _download_images(self, opt: Opt):
    """
    Download available IGC images.
    """
    urls = []
    ids = []
    # Collect (url, id) pairs from the crowd-annotation CSV for each split.
    for split in ['test', 'val']:
        csv_path = os.path.join(self.get_data_path(opt), f'IGC_crowd_{split}.csv')
        with open(csv_path, newline='\n') as csv_file:
            reader = csv.reader(csv_file, delimiter=',')
            # First row is the header; remaining rows map onto it.
            header = next(reader, [])
            for row in reader:
                record = dict(zip(header, row))
                urls.append(record['url'])
                ids.append(record['id'])
    image_dir = self.get_image_path(opt)
    os.makedirs(image_dir, exist_ok=True)
    # Make one blank image
    placeholder = Image.new('RGB', (100, 100), color=0)
    placeholder.save(os.path.join(image_dir, self.blank_image_id), 'JPEG')
    # Download the rest
    download_multiprocess(urls, image_dir, dest_filenames=ids)
    # Remove anything PIL cannot decode as an RGB image.
    for name in os.listdir(image_dir):
        candidate = os.path.join(image_dir, name)
        if not os.path.isfile(candidate):
            continue
        try:
            Image.open(candidate).convert('RGB')
        except OSError:
            os.remove(candidate)
def test_download_multiprocess(self):
    """
    Check multiprocess download preserves input order for names and statuses.

    The good URL should report HTTP 200 and the two intentionally broken
    URLs should report HTTP 403, in the same order as `urls`.
    """
    # Consistency fix: the sibling download tests all use https; http here
    # only added an extra redirect round-trip on parl.ai.
    urls = [
        'https://parl.ai/downloads/mnist/mnist.tar.gz',
        'https://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
        'https://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
    ]
    download_results = build_data.download_multiprocess(
        urls, self.datapath, dest_filenames=self.dest_filenames
    )
    output_filenames, output_statuses, output_errors = zip(*download_results)
    self.assertEqual(
        output_filenames, self.dest_filenames, 'output filenames not correct'
    )
    self.assertEqual(
        output_statuses, (200, 403, 403), 'output http statuses not correct'
    )
def test_download_multiprocess_chunks(self):
    # Tests that the three finish downloading but may finish in any order
    urls = [
        'https://parl.ai/downloads/mnist/mnist.tar.gz',
        'https://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
        'https://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
    ]
    download_results = build_data.download_multiprocess(
        urls, self.datapath, dest_filenames=self.dest_filenames, chunk_size=1
    )
    output_filenames, output_statuses, output_errors = zip(*download_results)
    # chunk_size=1 makes completion order nondeterministic: check membership.
    for expected_name in ('mnist0.tar.gz', 'mnist1.tar.gz', 'mnist2.tar.gz'):
        self.assertIn(expected_name, output_filenames)
    self.assertIn(200, output_statuses, 'unexpected error code')
    self.assertIn(403, output_statuses, 'unexpected error code')
def test_download_multiprocess(self):
    """
    Check multiprocess download preserves input order for names and statuses.

    Captured stdout is appended to each failure message for debugging.
    """
    urls = [
        'https://parl.ai/downloads/mnist/mnist.tar.gz',
        'https://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
        'https://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
    ]
    with testing_utils.capture_output() as stdout:
        download_results = build_data.download_multiprocess(
            urls, self.datapath, dest_filenames=self.dest_filenames
        )
    stdout = stdout.getvalue()
    output_filenames, output_statuses, output_errors = zip(*download_results)
    # Results must come back in the same order as the input URLs.
    self.assertEqual(
        output_filenames,
        self.dest_filenames,
        f'output filenames not correct\n{stdout}',
    )
    self.assertEqual(
        output_statuses,
        (200, 403, 403),
        f'output http statuses not correct\n{stdout}',
    )