def download_file(cls, uri, fobj):
    """
    Given a URI, download the file to the given file-like object.

    :param str uri: The URI of a file to download.
    :param file fobj: A file-like object to download the file to.
    :rtype: file
    :returns: The same file-like object that was passed in as ``fobj``.
    :raises InfileNotFoundException: If the key named by the URI cannot
        be found in the bucket.
    """
    # Break the URI into usable components.
    values = get_values_from_media_uri(uri)
    conn = cls._get_aws_s3_connection(values['username'], values['password'])
    bucket = conn.get_bucket(values['host'])
    key = bucket.get_key(values['path'])

    logger.debug("S3Backend.download_file(): "
                 "Downloading: %s" % uri)

    not_found_message = "The specified input file cannot be found."
    if key is None:
        # boto's Bucket.get_key() yields None for a missing key; fail
        # explicitly instead of relying on an AttributeError further down.
        raise InfileNotFoundException(not_found_message)

    dlhandler = ResumableDownloadHandler(num_retries=10)
    try:
        dlhandler.get_file(key, fobj, None)
    except AttributeError:
        # Kept for backward compatibility: historically this was how a
        # missing S3 key surfaced from ResumableDownloadHandler.
        raise InfileNotFoundException(not_found_message)

    logger.debug("S3Backend.download_file(): "
                 "Download of %s completed." % uri)
    return fobj
def test_failed_download_with_persistent_tracker(self):
    """
    Tests that failed resumable download leaves a correct tracker file
    """
    harness = CallbackTestHarness()
    tmpdir = self._MakeTempDir()
    tracker_file_name = self.make_tracker_file(tmpdir)
    dst_fp = self.make_dst_fp(tmpdir)
    res_download_handler = ResumableDownloadHandler(
        tracker_file_name=tracker_file_name, num_retries=0)
    small_src_key_as_string, small_src_key = self.make_small_key()
    try:
        small_src_key.get_contents_to_file(
            dst_fp, cb=harness.call,
            res_download_handler=res_download_handler)
        self.fail('Did not get expected ResumableDownloadException')
    except ResumableDownloadException as e:
        # We'll get a ResumableDownloadException at this point because
        # of CallbackTestHarness (above). Check that the tracker file was
        # created correctly.
        self.assertEqual(e.disposition,
                         ResumableTransferDisposition.ABORT_CUR_PROCESS)
        self.assertTrue(os.path.exists(tracker_file_name))
        # Read the first line via a context manager so the handle is
        # closed even when an assertion below fails.
        with open(tracker_file_name) as f:
            etag_line = f.readline()
        # The tracker's first line must be the key's etag without quotes.
        self.assertEqual(etag_line.rstrip('\n'),
                         small_src_key.etag.strip('"\''))
def test_download_with_file_content_change_during_download(self):
    """
    Tests resumable download on an object where the file content changes
    without changing length while download in progress
    """
    # NOTE(review): this block only drives the first, aborted attempt; no
    # content change or restart is performed here -- confirm whether the
    # remainder of the scenario lives elsewhere.
    harness = CallbackTestHarnass(
        fail_after_n_bytes=self.larger_src_key_size / 2,
        num_times_to_fail=2)
    # Set up first process' ResumableDownloadHandler not to do any
    # retries (initial download request will establish expected size to
    # download server).
    res_download_handler = ResumableDownloadHandler(
        tracker_file_name=self.tracker_file_name, num_retries=0)
    try:
        self.larger_src_key.get_contents_to_file(
            self.dst_fp, cb=harness.call,
            res_download_handler=res_download_handler)
        self.fail('Did not get expected ResumableDownloadException')
    except ResumableDownloadException as e:
        # First abort (from harness-forced failure) should be
        # ABORT_CUR_PROCESS.
        self.assertEqual(e.disposition,
                         ResumableTransferDisposition.ABORT_CUR_PROCESS)
        # Ensure a tracker file survived.
        self.assertTrue(os.path.exists(self.tracker_file_name))
def test_multiple_in_process_failures_then_succeed_with_tracker_file(self):
    """
    Tests resumable download that fails completely in one process,
    then when restarted completes, using a tracker file
    """
    # The callback is configured to fail twice, starting once half of the
    # object has been transferred, while the handler below allows no
    # retries -- so the single attempt made here must abort.
    failing_cb = CallbackTestHarness(
        fail_after_n_bytes=LARGE_KEY_SIZE/2, num_times_to_fail=2)
    payload = os.urandom(LARGE_KEY_SIZE)
    src_key = self._MakeKey(data=payload)
    work_dir = self._MakeTempDir()
    tracker_path = self.make_tracker_file(work_dir)
    out_fp = self.make_dst_fp(work_dir)
    handler = ResumableDownloadHandler(
        tracker_file_name=tracker_path, num_retries=0)
    try:
        src_key.get_contents_to_file(
            out_fp, cb=failing_cb.call, res_download_handler=handler)
        self.fail('Did not get expected ResumableDownloadException')
    except ResumableDownloadException as err:
        self.assertEqual(err.disposition,
                         ResumableTransferDisposition.ABORT_CUR_PROCESS)
        # The tracker file has to outlive the aborted attempt.
        self.assertTrue(os.path.exists(tracker_path))
def _pull_key(self, key, fname):
    """
    Download ``key`` into ``fname`` through a temporary file.

    Returns ``fname`` when the local file already matches the key's
    checksum or when the download succeeds, and ``None`` when the
    download raises.
    """
    Logger.debug("Pulling key '{}' from bucket '{}' to file '{}'".format(
        key.name, key.bucket.name, fname))

    self._makedirs(fname)
    tmp_path = self.tmp_file(fname)
    target_name = os.path.basename(fname)

    # Nothing to do when the local copy is already up to date.
    if self._cmp_checksum(key, fname):
        Logger.debug('File "{}" matches with "{}".'.format(
            fname, key.name))
        return fname

    Logger.debug('Downloading cache file from S3 "{}/{}" to "{}"'.format(
        key.bucket.name, key.name, fname))

    tracker_path = self._download_tracker(tmp_path)
    resumable = ResumableDownloadHandler(tracker_file_name=tracker_path,
                                         num_retries=10)
    try:
        key.get_contents_to_filename(tmp_path,
                                     cb=create_cb(target_name),
                                     res_download_handler=resumable)
    except Exception as exc:
        Logger.error('Failed to download "{}": {}'.format(key.name, exc))
        return None

    # Move the finished download into place only after it succeeded.
    os.rename(tmp_path, fname)
    progress.finish_target(target_name)
    Logger.debug('Downloading completed')
    return fname
def test_download_with_inital_partial_download_before_failure(self):
    """
    Tests resumable download that successfully downloads some content
    before it fails, then restarts and completes
    """
    # Fail only after half of the object has been transferred, so part of
    # the data is already written before the single allowed retry.
    failing_cb = CallbackTestHarness(fail_after_n_bytes=LARGE_KEY_SIZE/2)
    payload = os.urandom(LARGE_KEY_SIZE)
    src_key = self._MakeKey(data=payload)
    handler = ResumableDownloadHandler(num_retries=1)
    out_fp = self.make_dst_fp()
    src_key.get_contents_to_file(
        out_fp, cb=failing_cb.call, res_download_handler=handler)

    # Ensure downloaded object has correct content.
    self.assertEqual(LARGE_KEY_SIZE, get_cur_file_size(out_fp))
    self.assertEqual(payload, src_key.get_contents_as_string())

    # Ensure some of the file was downloaded both before and after failure.
    self.assertTrue(
        len(failing_cb.transferred_seq_before_first_failure) > 1 and
        len(failing_cb.transferred_seq_after_first_failure) > 1)
def test_failed_download_with_persistent_tracker(self):
    """
    Tests that failed resumable download leaves a correct tracker file
    """
    harnass = CallbackTestHarnass()
    res_download_handler = ResumableDownloadHandler(
        tracker_file_name=self.tracker_file_name, num_retries=0)
    try:
        self.small_src_key.get_contents_to_file(
            self.dst_fp, cb=harnass.call,
            res_download_handler=res_download_handler)
        self.fail('Did not get expected ResumableDownloadException')
    except ResumableDownloadException as e:
        # We'll get a ResumableDownloadException at this point because
        # of CallbackTestHarnass (above). Check that the tracker file was
        # created correctly.
        self.assertEqual(e.disposition,
                         ResumableTransferDisposition.ABORT_CUR_PROCESS)
        self.assertTrue(os.path.exists(self.tracker_file_name))
        # Context manager guarantees the handle is closed even if reading
        # the line raises.
        with open(self.tracker_file_name) as f:
            etag_line = f.readline()
        # The first tracker line must match the handler's etag pattern.
        m = re.search(ResumableDownloadHandler.ETAG_REGEX, etag_line)
        self.assertTrue(m)
def _import(self, bucket_name, key_name, fname, data_item):
    """
    Fetch ``key_name`` from ``bucket_name`` into ``fname``.

    Returns ``data_item`` when the local file already matches the key's
    checksum or when the download succeeds; returns ``None`` when the key
    does not exist or when downloading/renaming raises.
    """
    bucket = self._get_bucket_aws(bucket_name)
    tmp_path = self.tmp_file(fname)
    target_name = os.path.basename(fname)

    key = bucket.get_key(key_name)
    if not key:
        Logger.error(
            'File "{}" does not exist in the cloud'.format(key_name))
        return None

    # Skip the transfer when the local copy is already up to date.
    if self._cmp_checksum(key, fname):
        Logger.debug('File "{}" matches with "{}".'.format(
            fname, key_name))
        return data_item

    Logger.debug('Downloading cache file from S3 "{}/{}" to "{}"'.format(
        bucket.name, key_name, fname))

    tracker_path = self._download_tracker(tmp_path)
    resumable = ResumableDownloadHandler(tracker_file_name=tracker_path,
                                         num_retries=10)
    try:
        key.get_contents_to_filename(tmp_path,
                                     cb=create_cb(target_name),
                                     res_download_handler=resumable)
        # The rename is covered by the same handler as the download.
        os.rename(tmp_path, fname)
    except Exception as exc:
        Logger.error('Failed to download "{}": {}'.format(key_name, exc))
        return None

    progress.finish_target(target_name)
    Logger.debug('Downloading completed')
    return data_item
def test_download_with_inconsistent_etag_in_tracker(self):
    """
    Tests resumable download with an inconsistent etag in tracker file
    """
    tmp_dir = self._MakeTempDir()
    dst_fp = self.make_dst_fp(tmp_dir)
    small_src_key_as_string, small_src_key = self.make_small_key()
    inconsistent_etag_tracker_file_name = os.path.join(
        tmp_dir, 'inconsistent_etag_tracker')
    # Seed the tracker with the key's real etag reversed, so it is
    # etag-shaped but does not match the object.
    good_etag = small_src_key.etag.strip('"\'')
    with open(inconsistent_etag_tracker_file_name, 'w') as f:
        f.write('%s\n' % good_etag[::-1])
    res_download_handler = ResumableDownloadHandler(
        tracker_file_name=inconsistent_etag_tracker_file_name)
    # An error should be printed about the expired tracker, but then it
    # should run the update successfully.
    small_src_key.get_contents_to_file(
        dst_fp, res_download_handler=res_download_handler)
    self.assertEqual(SMALL_KEY_SIZE, get_cur_file_size(dst_fp))
    self.assertEqual(small_src_key_as_string,
                     small_src_key.get_contents_as_string())
def test_zero_length_object_download(self):
    """
    Tests downloading a zero-length object (exercises boundary
    conditions).
    """
    handler = ResumableDownloadHandler()
    self.empty_src_key.get_contents_to_file(
        self.dst_fp, res_download_handler=handler)
    # Nothing should have been written to the destination.
    downloaded_size = get_cur_file_size(self.dst_fp)
    self.assertEqual(0, downloaded_size)
def test_zero_length_object_download(self):
    """
    Tests downloading a zero-length object (exercises boundary
    conditions).
    """
    handler = ResumableDownloadHandler()
    out_fp = self.make_dst_fp()
    # A key created without data serves as the zero-length source.
    empty_key = self._MakeKey()
    empty_key.get_contents_to_file(out_fp, res_download_handler=handler)
    downloaded_size = get_cur_file_size(out_fp)
    self.assertEqual(0, downloaded_size)
def test_download_without_persistent_tracker(self):
    """
    Tests a single resumable download, with no tracker persistence
    """
    # No tracker_file_name is passed to the handler.
    handler = ResumableDownloadHandler()
    src_key = self.small_src_key
    src_key.get_contents_to_file(self.dst_fp, res_download_handler=handler)

    downloaded_size = get_cur_file_size(self.dst_fp)
    self.assertEqual(self.small_src_key_size, downloaded_size)
    self.assertEqual(self.small_src_key_as_string,
                     src_key.get_contents_as_string())
def test_download_without_persistent_tracker(self):
    """
    Tests a single resumable download, with no tracker persistence
    """
    # No tracker_file_name is passed to the handler.
    handler = ResumableDownloadHandler()
    out_fp = self.make_dst_fp()
    expected_data, src_key = self.make_small_key()
    src_key.get_contents_to_file(out_fp, res_download_handler=handler)

    downloaded_size = get_cur_file_size(out_fp)
    self.assertEqual(SMALL_KEY_SIZE, downloaded_size)
    self.assertEqual(expected_data, src_key.get_contents_as_string())
def test_multiple_in_process_failures_then_succeed(self):
    """
    Tests resumable download that fails twice in one process, then
    completes
    """
    # Inject exactly two failures through the progress callback; without
    # this the download never fails and the retry path described in the
    # docstring is not exercised at all.
    harnass = CallbackTestHarnass(num_times_to_fail=2)
    # Three retries leave room for both injected failures.
    res_download_handler = ResumableDownloadHandler(num_retries=3)
    self.small_src_key.get_contents_to_file(
        self.dst_fp, cb=harnass.call,
        res_download_handler=res_download_handler)
    # Ensure downloaded object has correct content.
    self.assertEqual(self.small_src_key_size,
                     get_cur_file_size(self.dst_fp))
    self.assertEqual(self.small_src_key_as_string,
                     self.small_src_key.get_contents_as_string())
def download_file(self, filename, destination=None):
    """
    Download ``filename`` from ``self.bucket`` to ``destination`` and
    validate the result.

    :param filename: Name of the key to fetch from the bucket.
    :param destination: Local path to write to; falls back to
        ``filename`` when empty or ``None``.
    :raises utils.Md5CheckError: If the local MD5 differs from the
        ``nimbus-md5`` metadata value or, when that is absent and the
        etag contains no ``-``, from the etag.
    :raises FileSizeError: If neither MD5 source applies and the local
        size differs from ``key.size``.
    """
    if not destination:
        destination = filename
    key = self.bucket.get_key(filename)

    # NOTE(review): tempfile.mktemp() is deprecated and race-prone; it is
    # kept because the handler is given a tracker path that does not exist
    # yet -- consider a private temp directory instead.
    handler = ResumableDownloadHandler(tempfile.mktemp())
    handler._save_tracker_info(key)  # Ugly but necessary
    self.rate_limiter.reset(self.rate_limit, -1)

    # Binary append: the payload is raw bytes, and open() (unlike the
    # Python 2-only file() builtin) works on every interpreter version.
    with open(destination, "ab") as f:
        handler.get_file(key, f, {}, cb=self.callbacks, num_cb=-1)

    # Validate the downloaded file.
    md5 = utils.md5_for_large_file(destination)
    s3_md5 = key.metadata.get('nimbus-md5', None)
    if s3_md5:
        if s3_md5 != md5:
            raise utils.Md5CheckError("md5 mismatch")
    elif '-' not in key.etag:
        # Without the metadata value, compare against the etag.
        # presumably an etag containing '-' is not a plain MD5 -- TODO confirm
        s3_md5 = key.etag.strip('"\'')
        if md5 != s3_md5:
            raise utils.Md5CheckError("md5 mismatch")
    else:
        # No usable MD5: fall back to comparing sizes.
        size = os.path.getsize(destination)
        if size != key.size:
            raise FileSizeError("error")
def test_download_with_unwritable_tracker_file(self):
    """
    Tests resumable download with an unwritable tracker file
    """
    # Make dir where tracker_file lives temporarily unwritable.
    save_mod = os.stat(self.tmp_dir).st_mode
    try:
        os.chmod(self.tmp_dir, 0)
        res_download_handler = ResumableDownloadHandler(
            tracker_file_name=self.tracker_file_name)
    except ResumableDownloadException as e:
        self.assertEqual(e.disposition, ResumableTransferDisposition.ABORT)
        self.assertNotEqual(
            e.message.find('Couldn\'t write URI tracker file'), -1)
    finally:
        # Restore original protection of dir where tracker_file lives;
        # otherwise the directory stays at mode 0 for later tests and
        # cleanup.
        os.chmod(self.tmp_dir, save_mod)
def test_non_retryable_exception_handling(self):
    """
    Tests resumable download that fails with a non-retryable exception
    """
    harnass = CallbackTestHarnass(
        exception=OSError(errno.EACCES, 'Permission denied'))
    res_download_handler = ResumableDownloadHandler(num_retries=1)
    try:
        self.small_src_key.get_contents_to_file(
            self.dst_fp, cb=harnass.call,
            res_download_handler=res_download_handler)
        self.fail('Did not get expected OSError')
    except OSError as e:
        # Ensure the error was re-raised. Compare against the same
        # symbolic constant used to build the exception rather than a
        # hard-coded numeric value.
        self.assertEqual(e.errno, errno.EACCES)
def test_broken_pipe_recovery(self):
    """
    Tests handling of a Broken Pipe (which interacts with an httplib bug)
    """
    # The callback raises EPIPE; the handler is allowed one retry.
    failing_cb = CallbackTestHarnass(
        exception=IOError(errno.EPIPE, "Broken pipe"))
    handler = ResumableDownloadHandler(num_retries=1)
    src_key = self.small_src_key
    src_key.get_contents_to_file(
        self.dst_fp, cb=failing_cb.call, res_download_handler=handler)

    # Ensure downloaded object has correct content.
    downloaded_size = get_cur_file_size(self.dst_fp)
    self.assertEqual(self.small_src_key_size, downloaded_size)
    self.assertEqual(self.small_src_key_as_string,
                     src_key.get_contents_as_string())
def test_retryable_exception_recovery(self):
    """
    Tests handling of a retryable exception
    """
    # Use the first entry of the handler's RETRYABLE_EXCEPTIONS as the
    # failure injected by the callback.
    retryable = ResumableDownloadHandler.RETRYABLE_EXCEPTIONS[0]
    failing_cb = CallbackTestHarnass(exception=retryable)
    handler = ResumableDownloadHandler(num_retries=1)
    src_key = self.small_src_key
    src_key.get_contents_to_file(
        self.dst_fp, cb=failing_cb.call, res_download_handler=handler)

    # Ensure downloaded object has correct content.
    downloaded_size = get_cur_file_size(self.dst_fp)
    self.assertEqual(self.small_src_key_size, downloaded_size)
    self.assertEqual(self.small_src_key_as_string,
                     src_key.get_contents_as_string())
def _GetTransferHandlers(self, uri, key, file_size, upload):
    """
    Selects upload/download and callback handlers.

    Below the configurable 'resumable_threshold' (default ONE_MB) no
    callback and no transfer handler are used. At or above it, a progress
    callback is returned together with a resumable transfer handler:
    always for downloads, and for uploads only when the URI scheme is
    'gs' (otherwise the upload gets no transfer handler).

    Returns a (cb, num_cb, transfer_handler) tuple.
    """
    config = boto.config
    resumable_threshold = config.getint('GSUtil', 'resumable_threshold',
                                        ONE_MB)
    if file_size < resumable_threshold:
        # Small transfer: no progress reporting, no resumability.
        return (None, None, None)

    progress_cb = self._FileCopyCallbackHandler(upload).call
    progress_calls = int(file_size / ONE_MB)

    default_tracker_dir = os.path.expanduser('~' + os.sep + '.gsutil')
    tracker_dir = config.get('GSUtil', 'resumable_tracker_dir',
                             default_tracker_dir)
    if not os.path.exists(tracker_dir):
        os.makedirs(tracker_dir)

    if upload:
        # Encode the src bucket and key into the tracker file name.
        raw_name = 'resumable_upload__%s__%s.url' % (
            key.bucket.name, key.name)
    else:
        # Encode the fully-qualified src file name into the tracker file
        # name.
        raw_name = 'resumable_download__%s.etag' % (
            os.path.realpath(uri.object_name))
    # Path separators are not allowed inside the tracker file's own name.
    tracker_name = re.sub('[/\\\\]', '_', raw_name)
    tracker_file = '%s%s%s' % (tracker_dir, os.sep, tracker_name)

    if not upload:
        handler = ResumableDownloadHandler(tracker_file)
    elif uri.scheme == 'gs':
        handler = ResumableUploadHandler(tracker_file)
    else:
        handler = None
    return (progress_cb, progress_calls, handler)
def get_file(self, obj_path, file_path, tracker_path):
    """
    Download the object at ``obj_path`` into ``file_path`` with a
    resumable handler that tracks its state in ``tracker_path``.

    Returns ``None`` in every case, including when the key is missing.
    """
    key = self.bucket.get_key(obj_path)
    if key is None:
        return None

    resumable = ResumableDownloadHandler(tracker_path)
    display_name = os.path.basename(obj_path)
    with make_progress_bar(display_name, key.size) as pbar:

        def on_progress(total_xfer, total_size):
            # Only the transferred amount is forwarded to the bar.
            pbar.update(total_xfer)

        # The destination is opened in binary append mode.
        with open(file_path, 'ab') as out_file:
            key.get_contents_to_file(out_file,
                                     cb=on_progress,
                                     num_cb=NUM_CB,
                                     res_download_handler=resumable)
def test_failed_and_restarted_download_with_persistent_tracker(self):
    """
    Tests resumable download that fails once and then completes,
    with tracker file
    """
    failing_cb = CallbackTestHarnass()
    tracker_path = self.tracker_file_name
    handler = ResumableDownloadHandler(
        tracker_file_name=tracker_path, num_retries=1)
    src_key = self.small_src_key
    src_key.get_contents_to_file(
        self.dst_fp, cb=failing_cb.call, res_download_handler=handler)

    # Ensure downloaded object has correct content.
    downloaded_size = get_cur_file_size(self.dst_fp)
    self.assertEqual(self.small_src_key_size, downloaded_size)
    self.assertEqual(self.small_src_key_as_string,
                     src_key.get_contents_as_string())

    # Ensure tracker file deleted.
    self.assertFalse(os.path.exists(tracker_path))
def test_download_with_unwritable_tracker_file(self):
    """
    Tests resumable download with an unwritable tracker file
    """
    work_dir = self._MakeTempDir()
    tracker_path = os.path.join(work_dir, 'tracker')
    # Remember the directory's mode so it can be restored afterwards.
    original_mode = os.stat(work_dir).st_mode
    try:
        # Strip all permissions from the directory holding the tracker.
        os.chmod(work_dir, 0)
        handler = ResumableDownloadHandler(tracker_file_name=tracker_path)
    except ResumableDownloadException as err:
        self.assertEqual(err.disposition,
                         ResumableTransferDisposition.ABORT)
        position = err.message.find('Couldn\'t write URI tracker file')
        self.assertNotEqual(position, -1)
    finally:
        # Restore original protection of dir where tracker_file lives.
        os.chmod(work_dir, original_mode)
def test_download_with_invalid_tracker_etag(self):
    """
    Tests resumable download with a tracker file containing an invalid
    etag
    """
    invalid_etag_tracker_file_name = os.path.join(
        self.tmp_dir, 'invalid_etag_tracker')
    # Seed the tracker with a value that is not a valid etag; the context
    # manager closes the file even if the write fails.
    with open(invalid_etag_tracker_file_name, 'w') as f:
        f.write('3.14159\n')
    res_download_handler = ResumableDownloadHandler(
        tracker_file_name=invalid_etag_tracker_file_name)
    # An error should be printed about the invalid tracker, but then it
    # should run the update successfully.
    self.small_src_key.get_contents_to_file(
        self.dst_fp, res_download_handler=res_download_handler)
    self.assertEqual(self.small_src_key_size,
                     get_cur_file_size(self.dst_fp))
    self.assertEqual(self.small_src_key_as_string,
                     self.small_src_key.get_contents_as_string())
def test_failed_and_restarted_download_with_persistent_tracker(self):
    """
    Tests resumable download that fails once and then completes,
    with tracker file
    """
    failing_cb = CallbackTestHarness()
    work_dir = self._MakeTempDir()
    tracker_path = self.make_tracker_file(work_dir)
    out_fp = self.make_dst_fp(work_dir)
    expected_data, src_key = self.make_small_key()
    handler = ResumableDownloadHandler(
        tracker_file_name=tracker_path, num_retries=1)
    src_key.get_contents_to_file(
        out_fp, cb=failing_cb.call, res_download_handler=handler)

    # Ensure downloaded object has correct content.
    downloaded_size = get_cur_file_size(out_fp)
    self.assertEqual(SMALL_KEY_SIZE, downloaded_size)
    self.assertEqual(expected_data, src_key.get_contents_as_string())

    # Ensure tracker file deleted.
    self.assertFalse(os.path.exists(tracker_path))
def test_download_with_invalid_tracker_etag(self):
    """
    Tests resumable download with a tracker file containing an invalid
    etag
    """
    tmp_dir = self._MakeTempDir()
    dst_fp = self.make_dst_fp(tmp_dir)
    small_src_key_as_string, small_src_key = self.make_small_key()
    invalid_etag_tracker_file_name = os.path.join(tmp_dir,
                                                  'invalid_etag_tracker')
    # Seed the tracker with a value that is not a valid etag; the context
    # manager closes the file even if the write fails.
    with open(invalid_etag_tracker_file_name, 'w') as f:
        f.write('3.14159\n')
    res_download_handler = ResumableDownloadHandler(
        tracker_file_name=invalid_etag_tracker_file_name)
    # An error should be printed about the invalid tracker, but then it
    # should run the update successfully.
    small_src_key.get_contents_to_file(
        dst_fp, res_download_handler=res_download_handler)
    self.assertEqual(SMALL_KEY_SIZE, get_cur_file_size(dst_fp))
    self.assertEqual(small_src_key_as_string,
                     small_src_key.get_contents_as_string())
def test_multiple_in_process_failures_then_succeed_with_tracker_file(self):
    """
    Tests resumable download that fails completely in one process,
    then when restarted completes, using a tracker file
    """
    # Set up test harness that causes more failures than a single
    # ResumableDownloadHandler instance will handle, writing enough data
    # before the first failure that some of it survives that process run.
    harnass = CallbackTestHarnass(
        fail_after_n_bytes=self.larger_src_key_size / 2,
        num_times_to_fail=2)
    res_download_handler = ResumableDownloadHandler(
        tracker_file_name=self.tracker_file_name, num_retries=0)
    try:
        self.larger_src_key.get_contents_to_file(
            self.dst_fp, cb=harnass.call,
            res_download_handler=res_download_handler)
        self.fail('Did not get expected ResumableDownloadException')
    except ResumableDownloadException as e:
        # Running out of retries aborts only the current process; this
        # matches what the other retry-exhaustion tests in this file
        # expect for the same setup (harness failure, num_retries=0).
        self.assertEqual(e.disposition,
                         ResumableTransferDisposition.ABORT_CUR_PROCESS)
        # Ensure a tracker file survived.
        self.assertTrue(os.path.exists(self.tracker_file_name))
def test_download_with_object_size_change_between_starts(self):
    """
    Tests resumable download on an object that changes sizes between
    initial download start and restart
    """
    # NOTE(review): only the first, aborted attempt is driven here; no
    # size change or restart is performed in this block -- confirm whether
    # the remainder of the scenario lives elsewhere.
    harnass = CallbackTestHarnass(
        fail_after_n_bytes=self.larger_src_key_size / 2,
        num_times_to_fail=2)
    # Set up first process' ResumableDownloadHandler not to do any
    # retries (initial download request will establish expected size to
    # download server).
    res_download_handler = ResumableDownloadHandler(
        tracker_file_name=self.tracker_file_name, num_retries=0)
    try:
        self.larger_src_key.get_contents_to_file(
            self.dst_fp, cb=harnass.call,
            res_download_handler=res_download_handler)
        self.fail('Did not get expected ResumableDownloadException')
    except ResumableDownloadException as e:
        # Running out of retries aborts only the current process; this
        # matches what the other retry-exhaustion tests in this file
        # expect for the same setup (harness failure, num_retries=0).
        self.assertEqual(e.disposition,
                         ResumableTransferDisposition.ABORT_CUR_PROCESS)
        # Ensure a tracker file survived.
        self.assertTrue(os.path.exists(self.tracker_file_name))
def test_download_with_inconsistent_etag_in_tracker(self):
    """
    Tests resumable download with an inconsistent etag in tracker file
    """
    inconsistent_etag_tracker_file_name = os.path.join(
        self.tmp_dir, 'inconsistent_etag_tracker')
    # Seed the tracker with the key's real etag reversed, so it is
    # etag-shaped but does not match the object.
    good_etag = self.small_src_key.etag.strip('"\'')
    with open(inconsistent_etag_tracker_file_name, 'w') as f:
        f.write('%s\n' % good_etag[::-1])
    res_download_handler = ResumableDownloadHandler(
        tracker_file_name=inconsistent_etag_tracker_file_name)
    # An error should be printed about the expired tracker, but then it
    # should run the update successfully.
    self.small_src_key.get_contents_to_file(
        self.dst_fp, res_download_handler=res_download_handler)
    self.assertEqual(self.small_src_key_size,
                     get_cur_file_size(self.dst_fp))
    self.assertEqual(self.small_src_key_as_string,
                     self.small_src_key.get_contents_as_string())