Exemple #1
0
    def test_download_multiprocess_chunks(self):
        """
        Run a multiprocess download with ``chunk_size=1`` and check the results.

        The three downloads may finish in any order, so the assertions only test
        that the expected filenames and HTTP statuses are present, not where they
        appear.
        """
        urls = [
            'https://parl.ai/downloads/mnist/mnist.tar.gz',
            'https://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
            'https://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
        ]

        with testing_utils.capture_output() as captured:
            results = build_data.download_multiprocess(
                urls,
                self.datapath,
                dest_filenames=self.dest_filenames,
                chunk_size=1,
            )
        # Captured output is appended to failure messages to ease debugging.
        log = captured.getvalue()

        # Each result unpacks into (filename, http status, error).
        filenames, statuses, _errors = zip(*results)

        for expected_name in ('mnist0.tar.gz', 'mnist1.tar.gz', 'mnist2.tar.gz'):
            self.assertIn(expected_name, filenames, f'missing file:\n{log}')
        for expected_status in (200, 403):
            self.assertIn(
                expected_status, statuses, f'unexpected error code:\n{log}'
            )
Exemple #2
0
def download_images(opt, task='personality_captions'):
    """
    Download the YFCC images referenced by a task's data files.

    The user is asked twice (via ``input``) to confirm before anything is
    downloaded. Image hashes are then collected from the train / val / test
    JSON files under ``<datapath>/<task>`` (the validation file is named
    ``valid.json`` for the ``image_chat`` task) and each image is fetched
    into ``<datapath>/yfcc_images``, which is finally marked as done.

    :param opt:
        mapping that provides the ``'datapath'`` entry.
    :param task:
        name of the task folder holding the JSON files.

    :raises RuntimeError:
        if either confirmation prompt is answered with anything but Y/y.
    """
    task_dir = os.path.join(opt['datapath'], task)
    image_path = os.path.join(opt['datapath'], 'yfcc_images')
    version = '1.0'

    permission_prompt = (
        'Please confirm that you have obtained permission '
        'to work with the YFCC100m dataset, as outlined by the steps '
        'listed at '
        'https://multimediacommons.wordpress.com/yfcc100m-core-dataset/ [Y/y]: '
    )
    if input(permission_prompt).lower() != 'y':
        raise RuntimeError(
            'In order to use the images from this dataset, '
            'you must obtain permission from the webpage above.'
        )

    duration_prompt = (
        'NOTE: This script will download each image individually from the '
        's3 server on which the images are hosted. This will take a *very '
        'long* time. Are you sure you would like to continue? [Y/y]: '
    )
    if input(duration_prompt).lower() != 'y':
        raise RuntimeError(
            'If you have access to the images, please specify '
            'the path to the folder via the `--yfcc-path` '
            'command line argument.'
        )

    image_prefix = 'https://multimedia-commons.s3-us-west-2.amazonaws.com/data/images'

    # image_chat names its validation split differently from the other tasks.
    valid_split = 'valid' if task == 'image_chat' else 'val'
    hashes = []
    for split in ('train', valid_split, 'test'):
        with open(os.path.join(task_dir, f'{split}.json')) as f:
            hashes.extend(entry['image_hash'] for entry in json.load(f))
    os.makedirs(image_path, exist_ok=True)

    print(f'[downloading images to {image_path}]')
    # Remote images are sharded by the first two 3-character groups of the hash.
    image_urls = []
    dest_names = []
    for p_hash in hashes:
        image_urls.append(
            f"{image_prefix}/{p_hash[:3]}/{p_hash[3:6]}/{p_hash}.jpg"
        )
        dest_names.append(f"{p_hash}.jpg")
    download_multiprocess(image_urls, image_path, dest_filenames=dest_names)
    build_data.mark_done(image_path, version)
    def _download_images(self, opt: Opt):
        """
        Download available IGC images.

        Collects the ``url`` and ``id`` columns from the test and val IGC CSV
        files, writes one blank placeholder image, downloads every listed URL
        into the image directory, and then deletes any file in that directory
        that cannot be opened and converted to RGB.

        :param opt:
            options forwarded to ``get_data_path`` and ``get_image_path``.
        """
        urls = []
        ids = []
        for dt in ['test', 'val']:
            df = os.path.join(self.get_data_path(opt), f'IGC_crowd_{dt}.csv')
            with open(df, newline='\n') as csv_file:
                reader = csv.reader(csv_file, delimiter=',')
                # The first row holds the column names; an empty file yields
                # no rows at all.
                fields = next(reader, [])
                for row in reader:
                    data = dict(zip(fields, row))
                    urls.append(data['url'])
                    ids.append(data['id'])

        image_dir = self.get_image_path(opt)
        os.makedirs(image_dir, exist_ok=True)
        # Make one blank image
        image = Image.new('RGB', (100, 100), color=0)
        image.save(os.path.join(image_dir, self.blank_image_id), 'JPEG')
        # Download the rest
        download_multiprocess(urls, image_dir, dest_filenames=ids)

        # Remove bad images
        for fp in os.listdir(image_dir):
            img_path = os.path.join(image_dir, fp)
            if not os.path.isfile(img_path):
                continue
            try:
                # Open inside a context manager so the file handle is closed
                # before the file is deleted; a handle left open by a failed
                # decode would leak and would block os.remove on Windows.
                with Image.open(img_path) as img:
                    img.convert('RGB')
            except OSError:
                os.remove(img_path)
Exemple #4
0
    def test_download_multiprocess(self):
        """
        Download one good and two bad URLs and check the reported results.

        The filenames must equal ``self.dest_filenames`` and the HTTP statuses
        must be 200 followed by two 403s.
        """
        urls = [
            'http://parl.ai/downloads/mnist/mnist.tar.gz',
            'http://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
            'http://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
        ]

        results = build_data.download_multiprocess(
            urls,
            self.datapath,
            dest_filenames=self.dest_filenames,
        )

        # Each result unpacks into (filename, http status, error).
        filenames, statuses, _errors = zip(*results)

        self.assertEqual(
            filenames, self.dest_filenames, 'output filenames not correct'
        )
        self.assertEqual(
            statuses, (200, 403, 403), 'output http statuses not correct'
        )
Exemple #5
0
    def test_download_multiprocess_chunks(self):
        """
        Run a multiprocess download with ``chunk_size=1`` and check the results.

        The three downloads may finish in any order, so only the presence of
        the expected filenames and HTTP statuses is asserted.
        """
        urls = [
            'https://parl.ai/downloads/mnist/mnist.tar.gz',
            'https://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
            'https://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
        ]

        results = build_data.download_multiprocess(
            urls,
            self.datapath,
            dest_filenames=self.dest_filenames,
            chunk_size=1,
        )

        # Each result unpacks into (filename, http status, error).
        filenames, statuses, _errors = zip(*results)

        for expected_name in ('mnist0.tar.gz', 'mnist1.tar.gz', 'mnist2.tar.gz'):
            self.assertIn(expected_name, filenames)
        for expected_status in (200, 403):
            self.assertIn(expected_status, statuses, 'unexpected error code')
Exemple #6
0
    def test_download_multiprocess(self):
        """
        Download one good and two bad URLs and check the reported results.

        Output printed during the download is captured and appended to the
        assertion messages so that a failure shows what happened.
        """
        urls = [
            'https://parl.ai/downloads/mnist/mnist.tar.gz',
            'https://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
            'https://parl.ai/downloads/mnist/mnist.tar.gz.BAD',
        ]

        with testing_utils.capture_output() as captured:
            results = build_data.download_multiprocess(
                urls,
                self.datapath,
                dest_filenames=self.dest_filenames,
            )
        log = captured.getvalue()

        # Each result unpacks into (filename, http status, error).
        filenames, statuses, _errors = zip(*results)

        self.assertEqual(
            filenames,
            self.dest_filenames,
            f'output filenames not correct\n{log}',
        )
        self.assertEqual(
            statuses,
            (200, 403, 403),
            f'output http statuses not correct\n{log}',
        )