def _upload_s3(datafiles, job_id, bucket_name='infernyx'):
    rval = []
    conn = boto.connect_s3()
    bucket = conn.get_bucket(bucket_name, validate=False)
    for tmp_file_list, _, tablename, columns in datafiles:
        s3_entries = []
        for tmpfile in tmp_file_list:
            with open(tmpfile) as f:
                md5 = compute_md5(f)
            k = Key(bucket)
            k.key = "%s-%s" % (job_id, tmpfile)
            _log(job_id, "->S3 %s/%s" % (bucket_name, k.key))
            k.set_contents_from_filename(tmpfile, md5=md5, replace=True)
            s3_entry = {
                "url": "s3://%s/%s" % (bucket_name, k.key),
                "mandatory": True
            }
            s3_entries.append(s3_entry)

        # upload the manifest
        prefix = tmp_file_list[0].rsplit('.')[0]
        manifest = ujson.dumps({"entries": s3_entries})
        manifest_key = Key(bucket)
        manifest_key.key = "%s.%s.manifest" % (job_id, prefix)
        _log(job_id, "->S3 %s/%s: %s" % (bucket_name, manifest_key.key, manifest))
        manifest_key.set_contents_from_string(manifest)

        # store manifest
        rval.append(DataFile(tmp_file_list, (bucket_name, manifest_key.key),
                             tablename, columns))
    return rval
def upload_file(bucket, key_name, file_path, remote_prefix=None,
                policy='public-read', metadata=None):
    if not metadata:
        metadata = {}
    if remote_prefix:
        key_name = '{0}/{1}'.format(remote_prefix, key_name)
    fd = open(file_path)
    md5 = compute_md5(fd)
    fd.close()

    current_md5 = None
    current_key = bucket.lookup(key_name)
    if current_key:
        current_md5 = current_key.get_metadata('fabix-md5')
    if current_md5 == md5[0]:
        for k, v in metadata.iteritems():
            current_key.set_metadata(k, v)
        puts("Skip file {0}".format(file_path))
        return current_key

    key = bucket.new_key(key_name)
    for k, v in metadata.iteritems():
        key.set_metadata(k, v)
    key.set_metadata('fabix-md5', md5[0])
    puts("Upload file {0}".format(file_path))
    key.set_contents_from_filename(file_path, md5=md5, policy=policy)
    return key
def setUp(self):
    self.mock_s3 = moto.mock_s3()
    self.mock_s3.start()
    self.s3_conn = boto.connect_s3()
    self.s3_conn.create_bucket('source_bucket')
    self.source_bucket = self.s3_conn.get_bucket('source_bucket')
    self.data = [
        {'name': 'Roberto', 'birthday': '12/05/1987'},
        {'name': 'Claudia', 'birthday': '21/12/1985'},
    ]
    with closing(self.source_bucket.new_key('some_prefix/test_key')) as key:
        with TmpFile() as tmp_filename:
            with open(tmp_filename, 'w') as f:
                f.write(json.dumps(self.data))
            with open(tmp_filename) as f:
                self.key_md5 = compute_md5(f)
        key.metadata = {'total': 2, 'md5': self.key_md5}
        key.set_contents_from_string(json.dumps(self.data))
    self.tmp_bypass_resume_file = 'tests/data/tmp_s3_bypass_resume_persistence.pickle'
    shutil.copyfile('tests/data/s3_bypass_resume_persistence.pickle',
                    self.tmp_bypass_resume_file)
def putter(put, put_queue, stat_queue, options):
    logger = logging.getLogger('%s[putter-%d]' % (os.path.basename(sys.argv[0]),
                                                  current_process().pid))
    connection, bucket = None, None
    file_object_cache = FileObjectCache()
    while True:
        args = put_queue.get()
        if args is None:
            put_queue.task_done()
            break
        key_name, value_kwargs = args
        value = Value(file_object_cache, **value_kwargs)
        try:
            if connection is None:
                connection = S3Connection(is_secure=options.secure, host=options.host)
            if bucket is None:
                bucket = connection.get_bucket(options.bucket)
            key = put(bucket, key_name, value)
            if key:
                if options.headers:
                    headers = dict(tuple(header.split(':', 1))
                                   for header in options.headers)
                else:
                    headers = {}
                if options.content_type:
                    if options.content_type == "guess":
                        headers['Content-Type'] = mimetypes.guess_type(value.path)[0]
                    else:
                        headers['Content-Type'] = options.content_type
                content = value.get_content()
                md5 = value.md5
                if options.gzip:
                    headers['Content-Encoding'] = 'gzip'
                    string_io = StringIO()
                    gzip_file = GzipFile(compresslevel=9, fileobj=string_io, mode='w')
                    gzip_file.write(content)
                    gzip_file.close()
                    content = string_io.getvalue()
                    md5 = compute_md5(StringIO(content))
                if not options.dry_run:
                    key.set_contents_from_string(content, headers, md5=md5,
                                                 policy=options.grant)
                logger.info('%s -> %s' % (value.path, key.name))
                stat_queue.put(dict(size=value.get_size()))
            else:
                logger.info('skipping %s -> %s' % (value.path, key_name))
        except SSLError as exc:
            logger.error('%s -> %s (%s)' % (value.path, key_name, exc))
            put_queue.put(args)
            connection, bucket = None, None
        put_queue.task_done()
def putter(put, put_queue, stat_queue, options):
    pid = current_process().pid
    log = logging.getLogger(os.path.basename(sys.argv[0]))
    connection, bucket = None, None
    file_object_cache = FileObjectCache()
    while True:
        args = put_queue.get()
        if args is None:
            put_queue.task_done()
            break
        key_name, value_kwargs = args
        if options.gzip:
            key_name = '%s.gz' % key_name
        value = Value(file_object_cache, **value_kwargs)
        try:
            if connection is None:
                connection = S3Connection(is_secure=options.secure, host=options.host)
            if bucket is None:
                bucket = connection.get_bucket(options.bucket, validate=False)
            key = put(bucket, key_name, value)
            if key:
                if value.should_copy_content():
                    if options.headers:
                        headers = dict(tuple(header.split(':', 1))
                                       for header in options.headers)
                    else:
                        headers = {}
                    content = value.get_content()
                    if options.gzip:
                        headers['Content-Encoding'] = 'gzip'
                        string_io = StringIO()
                        gzip_file = GzipFile(compresslevel=1, fileobj=string_io,
                                             mode='w')
                        gzip_file.write(content)
                        gzip_file.close()
                        content = string_io.getvalue()
                    md5 = compute_md5(StringIO(content))
                    if not options.dry_run:
                        key.set_contents_from_string(content, headers, md5=md5,
                                                     policy=options.grant,
                                                     encrypt_key=options.encrypt_key)
                    #log.info('%s %s> %s' % (value.path, 'z' if options.gzip else '-', key.name))
                stat_queue.put(dict(size=value.get_size()))
            else:
                log.info('skipping %s -> %s' % (value.path, key_name))
        except SSLError as exc:
            log.error('%s -> %s (%s)' % (value.path, key_name, exc))
            put_queue.put(args)
            connection, bucket = None, None
        except IOError as exc:
            log.error('%s -> %s (%s)' % (value.path, key_name, exc))
        put_queue.task_done()
def __get__(self, instance, owner):
    try:
        return (instance.md5, instance.b64md5)
    except AttributeError:
        instance.seek(0)
        (instance.md5, instance.b64md5, size) = compute_md5(instance,
                                                            instance.bytes)
        instance.seek(0)
        return (instance.md5, instance.b64md5)
def test_catches_bad_md5(self):
    '''Make sure Riak CS catches a bad content-md5 header'''
    key_name = str(uuid.uuid4())
    bucket = self.conn.create_bucket(self.bucket_name)
    key = Key(bucket, key_name)
    s = StringIO('not the real content')
    x = compute_md5(s)
    with self.assertRaises(S3ResponseError):
        key.set_contents_from_string('this is different from the md5 we calculated',
                                     md5=x)
def compute_localfile_md5sum(localfile):
    """
    Compute the hex-digested md5 checksum of the given ``localfile``.

    :param localfile: Path to a file on the local filesystem.
    """
    fp = open(localfile, 'rb')
    md5sum = compute_md5(fp)[0]
    fp.close()
    return md5sum
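For context on the `[0]` indexing in the helper above: `boto.utils.compute_md5` returns a three-tuple. A minimal, self-contained sketch; the payload here is illustrative, not from the source:

import tempfile
from boto.utils import compute_md5

with tempfile.NamedTemporaryFile() as tmp:
    tmp.write(b'example payload')  # illustrative content only
    tmp.flush()
    tmp.seek(0)
    # compute_md5 restores the file pointer to its starting position and
    # returns (hex digest, base64 digest, number of bytes hashed).
    hex_digest, b64_digest, size = compute_md5(tmp)
    assert size == len(b'example payload')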
def _create_key_metadata(self, dump_path, md5=None):
    from boto.utils import compute_md5
    metadata = {}
    metadata['total'] = self._get_total_count(dump_path)
    if md5:
        metadata['md5'] = md5
    else:
        with open(dump_path, 'r') as f:
            metadata['md5'] = compute_md5(f)
    return metadata
def upload_file(bucket, key_name, file_path, remote_prefix=None,
                policy='public-read'):
    if remote_prefix:
        key_name = '{0}/{1}'.format(remote_prefix, key_name)
    key = bucket.new_key(key_name)
    fd = open(file_path)
    md5 = compute_md5(fd)
    fd.close()
    key.set_metadata('fabix-md5', md5[0])
    key.set_contents_from_filename(file_path, md5=md5, policy=policy)
    return key
def _MakeFile(file_size):
    """Creates a temporary file of the given size and returns its path."""
    fd, fpath = tempfile.mkstemp(suffix='.bin', prefix='gsutil_test_file',
                                 text=False)
    self.file_sizes[fpath] = file_size
    self.file_contents[fpath] = os.urandom(file_size)
    with os.fdopen(fd, 'wb') as f:
        f.write(self.file_contents[fpath])
    with open(fpath, 'rb') as f:
        self.file_md5s[fpath] = compute_md5(f)
    return fpath
def _upload_s3(datafile, key_id, access_key, bucket_name, key):
    with open(datafile) as f:
        md5 = compute_md5(f)
    conn = boto.connect_s3(key_id, access_key)
    bucket = conn.get_bucket(bucket_name, validate=False)
    k = Key(bucket)
    k.key = key
    k.set_contents_from_filename(datafile, md5=md5, replace=True)
    return "s3://%s/%s" % (bucket_name, k.key)
def filedata(files, sftp, pathmatch='.*', noop=False):
    """Generator that yields file path, data as a tempfile, and tuple
    containing the etag/md5.

    :param files: List of files to work with.
    :type files: list
    :param sftp: paramiko.SFTPClient object.
    :type sftp: paramiko.SFTPClient
    :param pathmatch: Regex to match files, use to exclude unwatched files.
    :type pathmatch: str
    :param noop: Enable No Op to not download the file.
    :type noop: bool
    :returns: A generator containing str, tempfile.NamedTemporaryFile, tuple
    :rtype: generator
    """
    match = re.compile(pathmatch)
    for filepath in files:
        if re.search(match, filepath):
            # boto will set content type by file suffix
            suffix = os.path.splitext(filepath)[1]
            data = tempfile.NamedTemporaryFile(suffix=suffix)
            if not noop:
                try:
                    # sftp.get checks file sizes, so we don't manually stat
                    sftp.get(filepath, data.name)
                    # Seek file cursor to beginning before md5
                    data.seek(0)
                    md5 = compute_md5(data)
                    logging.info('Local MD5: ' + str(md5))
                    yield filepath, data, (md5[0], md5[1])
                except IOError:
                    logging.warning('Error downloading: ' + filepath)
            else:
                logging.info('NOOP: Downloading from SFTP: ' + filepath)
                md5 = compute_md5(data)
                yield filepath, data, (md5[0], md5[1])
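Because the `noop` branch above never touches the SFTP connection, the generator can be exercised without a server. A hedged usage sketch; the file names are placeholders:

# With noop=True the sftp handle is never used, so None suffices here.
for filepath, data, (md5_hex, md5_b64) in filedata(
        ['/outgoing/report.csv', '/outgoing/skip.log'],
        sftp=None, pathmatch=r'\.csv$', noop=True):
    # In noop mode, data is an empty NamedTemporaryFile, so md5_hex and
    # md5_b64 are the digests of an empty file.
    print(filepath, md5_hex)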
def _save_metadata_for_key(self, key, dump_path, md5=None):
    from boto.exception import S3ResponseError
    from boto.utils import compute_md5
    try:
        key.set_metadata('total', self._get_total_count(dump_path))
        if md5:
            key.set_metadata('md5', md5)
        else:
            with open(dump_path, 'r') as f:
                key.set_metadata('md5', compute_md5(f))
    except S3ResponseError:
        self.logger.warning('We have no READ_ACP/WRITE_ACP permissions, '
                            'so we could not add metadata info')
def _get_md5(self, key, tmp_filename):
    from boto.utils import compute_md5
    import re
    md5 = None
    md5_from_metadata = key.get_metadata('md5')
    if md5_from_metadata:
        match = re.match(r"\('(.*)', u'(.*)', (.*)\)", str(md5_from_metadata))
        if match:
            groups = match.groups()
            md5 = (groups[0], unicode(groups[1]), int(groups[2]))
    # If it's not in metadata, let's compute it
    if md5 is None:
        with open(tmp_filename) as f:
            md5 = compute_md5(f)
    return md5
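The regex above re-parses a stringified `compute_md5` tuple of the kind `_create_key_metadata` (earlier) stores as key metadata. A round-trip illustration with a made-up but self-consistent digest:

import re

# A made-up example of what str(compute_md5(f)) looks like once stored
# in key metadata by the companion helper above.
stored = "('9e107d9d372bb6826bd81d3542a419d6', u'nhB9nTcrtoJr2B01QqQZ1g==', 43)"
groups = re.match(r"\('(.*)', u'(.*)', (.*)\)", stored).groups()
md5 = (groups[0], groups[1], int(groups[2]))  # back to a usable tuple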
def _upload(retries_left=amount_of_retries):
    try:
        logging.info("Start uploading part #{0:d} of {1}".format(part_num, file_path))
        target_bucket = S3Connection(access_key_id,
                                     secret_access_key).get_bucket(target_bucket_name)
        for mp in target_bucket.get_all_multipart_uploads():
            if mp.id == multipart_id:
                with FileChunkIO(file_path, 'r', offset=offset, bytes=bytes) as fp:
                    hex_digest, base64_digest, data_size = utils.compute_md5(fp,
                                                                             size=bytes)
                    mp.upload_part_from_file(fp=fp, part_num=part_num, cb=cb, num_cb=1,
                                             md5=(hex_digest, base64_digest))
                break
    except Exception as exc:
        if retries_left:
            _upload(retries_left=retries_left - 1)
        else:
            logging.error("Failed uploading part #{0:d} of {1}".format(part_num,
                                                                       file_path))
            raise exc
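The `size=bytes` argument above caps how many bytes `compute_md5` reads, matching the chunk window that `FileChunkIO` exposes. A minimal sketch of that pattern; the path and sizes are assumptions, not from the source:

from boto import utils
from filechunkio import FileChunkIO

# Illustrative values; in the snippet above these come from the enclosing scope.
file_path = '/tmp/big.bin'
offset = 5 * 1024 * 1024
nbytes = 5 * 1024 * 1024
with FileChunkIO(file_path, 'r', offset=offset, bytes=nbytes) as fp:
    # Hash exactly this 5 MB window; compute_md5 resets fp, so the same
    # handle can then be passed to upload_part_from_file().
    hex_digest, base64_digest, data_size = utils.compute_md5(fp, size=nbytes)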
def upload_file(self, file_path):
    self._currently_processing.add(file_path)
    key = Key(self.get_target_bucket())
    rel_path = str(file_path[self._watched_dir_offset:])
    key.key = rel_path
    if (os.path.isfile(file_path)
            and os.stat(file_path).st_size > self._file_split_threshold_bytes):
        self.multipart_upload_file(file_path, key.key)
    else:
        fp = open(file_path, "r")
        hex_digest, base64_digest, data_size = utils.compute_md5(fp)
        fp.close()
        key.set_contents_from_filename(file_path, cb=upload_progress_cb, num_cb=1,
                                       md5=(hex_digest, base64_digest))
    # Check in queue since the same file path may have been added again
    # while this one was uploading
    if os.path.isfile(file_path) and not self.is_queued(file_path):
        os.remove(file_path)
    self._currently_processing.discard(file_path)
def _upload_s3(datafiles, job_id, bucket_name='infernyx'):
    rval = []
    for tmpfile, _, tablename, columns in datafiles:
        with open(tmpfile) as f:
            md5 = compute_md5(f)
        conn = boto.connect_s3()
        bucket = conn.get_bucket(bucket_name, validate=False)
        k = Key(bucket)
        k.key = "%s-%s" % (job_id, tmpfile)
        k.set_contents_from_filename(tmpfile, md5=md5, replace=True)
        rval.append(DataFile(tmpfile, (bucket_name, k.key), tablename, columns))
        _log(job_id, "->S3 %s/%s" % (bucket_name, k.key))
    return rval
def _MakeFile(file_size):
    """Creates a temporary file of the given size and returns its path."""
    fd, fpath = tempfile.mkstemp(suffix='.bin', prefix='gsutil_test_file',
                                 text=False)
    self.file_sizes[fpath] = file_size
    random_bytes = os.urandom(min(file_size, self.MAX_UNIQUE_RANDOM_BYTES))
    total_bytes = 0
    file_contents = ""
    while total_bytes < file_size:
        num_bytes = min(self.MAX_UNIQUE_RANDOM_BYTES, file_size - total_bytes)
        file_contents += random_bytes[:num_bytes]
        total_bytes += num_bytes
    self.file_contents[fpath] = file_contents
    with os.fdopen(fd, 'wb') as f:
        f.write(self.file_contents[fpath])
    with open(fpath, 'rb') as f:
        self.file_md5s[fpath] = compute_md5(f)
    return fpath
def test_bad_md5_leaves_old_object_alone(self):
    '''Github #705 Regression test:
    Make sure that overwriting an object using a bad md5
    simply leaves the old version in place.'''
    key_name = str(uuid.uuid4())
    bucket = self.conn.create_bucket(self.bucket_name)
    value = 'good value'

    good_key = Key(bucket, key_name)
    good_key.set_contents_from_string(value)

    bad_key = Key(bucket, key_name)
    s = StringIO('not the real content')
    x = compute_md5(s)
    try:
        bad_key.set_contents_from_string('this is different from the md5 we calculated',
                                         md5=x)
    except S3ResponseError:
        pass
    self.assertEqual(good_key.get_contents_as_string(), value)
async def _complete_multipart_upload(self, path, session_upload_id, parts_metadata):
    """This operation completes a multipart upload by assembling previously
    uploaded parts.

    Docs: https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadComplete.html
    """
    payload = ''.join([
        '<?xml version="1.0" encoding="UTF-8"?><CompleteMultipartUpload>',
        ''.join(
            ['<Part><PartNumber>{}</PartNumber><ETag>{}</ETag></Part>'.format(
                i + 1,
                xml.sax.saxutils.escape(part['ETAG'])
            ) for i, part in enumerate(parts_metadata)]
        ),
        '</CompleteMultipartUpload>',
    ]).encode('utf-8')
    headers = {
        'Content-Length': str(len(payload)),
        'Content-MD5': compute_md5(BytesIO(payload))[1],
        'Content-Type': 'text/xml',
    }
    params = {'uploadId': session_upload_id}
    complete_url = functools.partial(
        self.bucket.new_key(path.path).generate_url,
        settings.TEMP_URL_SECS,
        'POST',
        query_parameters=params,
        headers=headers,
    )
    resp = await self.make_request(
        'POST',
        complete_url,
        data=payload,
        headers=headers,
        params=params,
        expects=(200, 201),
        throws=exceptions.UploadError,
    )
    await resp.release()
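The `[1]` index above selects the base64 digest, which is what the HTTP `Content-MD5` header expects. A standalone sketch of just that header computation, with an illustrative payload:

from io import BytesIO
from boto.utils import compute_md5

payload = b'<Delete><Object><Key>example</Key></Object></Delete>'  # illustrative
# Index 1 of the (hex, base64, size) tuple is the base64 digest S3 wants.
headers = {
    'Content-MD5': compute_md5(BytesIO(payload))[1],
    'Content-Length': str(len(payload)),
}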
def compute_md5(self, fp):
    """
    :type fp: file
    :param fp: File pointer to the file to MD5 hash.  The file pointer
        will be reset to the beginning of the file before the method
        returns.

    :rtype: tuple
    :return: A tuple containing the hex digest version of the MD5 hash
        as the first element and the base64 encoded version of the
        plain digest as the second element.
    """
    tup = compute_md5(fp)
    # Returned values are MD5 hash, base64 encoded MD5 hash, and file size.
    # The internal implementation of compute_md5() needs to return the
    # file size but we don't want to return that value to the external
    # caller because it changes the class interface (i.e. it might
    # break some code) so we consume the third tuple value here and
    # return the remainder of the tuple to the caller, thereby preserving
    # the existing interface.
    self.size = tup[2]
    return tup[0:2]
def sync_dir_up(bucket_name, local_path, remote_prefix=None):
    puts("Sync directory {0} with bucket {1}".format(local_path, bucket_name))
    conn = boto.connect_s3()
    bucket = conn.get_bucket(bucket_name)
    for root, dirs, files in os.walk(local_path):
        for fname in files:
            file_path = os.path.join(root, fname)
            key_name = get_key_name(local_path, file_path)
            key = bucket.lookup(key_name)
            if key:
                key_md5 = key.get_metadata('fabix-md5')
                fd = open(file_path)
                md5 = compute_md5(fd)
                fd.close()
                if md5[0] == key_md5:
                    puts("Skipping {0} (MD5 match)".format(file_path))
                    continue
            puts("Upload file {0}".format(file_path))
            upload_file(bucket, key_name, file_path, remote_prefix=remote_prefix)
def _md5_checksum_metadata(source_path):
    checksum = {}
    with open(source_path, "rb") as fd:
        hex_digest, b64_digest, data_size = compute_md5(fd)
        checksum['b64_digest'] = b64_digest
    return checksum
def calculate_md5(self):
    if self.md5 is None:
        self.md5 = compute_md5(StringIO(self.get_content()))
    return self.md5
async def _delete_folder(self, path, **kwargs):
    """Query for recursive contents of folder and delete in batches of 1000

    Called from: func: delete if not path.is_file

    Calls: func: self._check_region
           func: self.make_request
           func: self.bucket.generate_url

    :param *ProviderPath path: Path to be deleted

    On S3, folders are not first-class objects, but are instead inferred
    from the names of their children.  A regular DELETE request issued
    against a folder will not work unless that folder is completely empty.
    To fully delete an occupied folder, we must delete all of the comprising
    objects.  Amazon provides a bulk delete operation to simplify this.
    """
    await self._check_region()

    more_to_come = True
    content_keys = []
    query_params = {'prefix': path.path}
    marker = None

    while more_to_come:
        if marker is not None:
            query_params['marker'] = marker

        resp = await self.make_request(
            'GET',
            self.bucket.generate_url(settings.TEMP_URL_SECS, 'GET',
                                     query_parameters=query_params),
            params=query_params,
            expects=(200, ),
            throws=exceptions.MetadataError,
        )

        contents = await resp.read()
        parsed = xmltodict.parse(contents, strip_whitespace=False)['ListBucketResult']
        more_to_come = parsed.get('IsTruncated') == 'true'
        contents = parsed.get('Contents', [])

        if isinstance(contents, dict):
            contents = [contents]

        content_keys.extend([content['Key'] for content in contents])
        if len(content_keys) > 0:
            marker = content_keys[-1]

    # Query against non-existent folder does not return 404
    if len(content_keys) == 0:
        raise exceptions.NotFoundError(str(path))

    while len(content_keys) > 0:
        key_batch = content_keys[:1000]
        del content_keys[:1000]

        payload = '<?xml version="1.0" encoding="UTF-8"?>'
        payload += '<Delete>'
        payload += ''.join(map(
            lambda x: '<Object><Key>{}</Key></Object>'.format(
                xml.sax.saxutils.escape(x)),
            key_batch
        ))
        payload += '</Delete>'
        payload = payload.encode('utf-8')
        md5 = compute_md5(BytesIO(payload))

        query_params = {'delete': ''}
        headers = {
            'Content-Length': str(len(payload)),
            'Content-MD5': md5[1],
            'Content-Type': 'text/xml',
        }

        # We depend on a customized version of boto that can make query
        # parameters part of the signature.
        url = functools.partial(
            self.bucket.generate_url,
            settings.TEMP_URL_SECS,
            'POST',
            query_parameters=query_params,
            headers=headers,
        )
        resp = await self.make_request(
            'POST',
            url,
            params=query_params,
            data=payload,
            headers=headers,
            expects=(200, 204),
            throws=exceptions.DeleteError,
        )
        await resp.release()
def putter(put, put_queue, stat_queue, options):
    pid = current_process().pid
    log = logging.getLogger(os.path.basename(sys.argv[0]))
    connection, bucket = None, None
    file_object_cache = FileObjectCache()

    # Figure out what content types we want to gzip
    if not options.gzip_type:  # default
        gzip_content_types = GZIP_CONTENT_TYPES
    elif 'all' in options.gzip_type:
        gzip_content_types = GZIP_ALL
    else:
        gzip_content_types = options.gzip_type
        if 'guess' in gzip_content_types:
            # don't bother removing 'guess' from the list since nothing will match it
            gzip_content_types.extend(GZIP_CONTENT_TYPES)

    while True:
        args = put_queue.get()
        if args is None:
            put_queue.task_done()
            break
        key_name, value_kwargs = args
        if options.gzip:
            key_name = '%s.gz' % key_name
        value = Value(file_object_cache, **value_kwargs)
        should_gzip = False
        try:
            if connection is None:
                connection = S3Connection(is_secure=options.secure, host=options.host)
            if bucket is None:
                bucket = connection.get_bucket(options.bucket, validate=False)
            key = put(bucket, key_name, value)
            if key:
                if value.should_copy_content():
                    if options.headers:
                        headers = dict(tuple(header.split(':', 1))
                                       for header in options.headers)
                    else:
                        headers = {}
                    content_type = None
                    if options.content_type:
                        if options.content_type == 'guess':
                            content_type = mimetypes.guess_type(value.path)[0]
                        elif options.content_type == 'magic':
                            content_type = mimetypes.guess_type(value.path)[0]
                            if content_type is None:
                                content_type = magic.from_file(value.path, mime=True)
                        else:
                            content_type = options.content_type
                        headers['Content-Type'] = content_type
                    content = value.get_content()
                    md5 = value.md5
                    should_gzip = options.gzip and (
                        content_type and content_type in gzip_content_types or
                        gzip_content_types == GZIP_ALL)
                    if should_gzip:
                        headers['Content-Encoding'] = 'gzip'
                        string_io = StringIO()
                        gzip_file = GzipFile(compresslevel=1, fileobj=string_io,
                                             mode='w')
                        gzip_file.write(content)
                        gzip_file.close()
                        content = string_io.getvalue()
                        md5 = compute_md5(StringIO(content))
                    if not options.dry_run:
                        key.set_contents_from_string(content, headers, md5=md5,
                                                     policy=options.grant,
                                                     encrypt_key=options.encrypt_key)
                    #log.info('%s %s> %s' % (value.path, 'z' if should_gzip else '-', key.name))
                stat_queue.put(dict(size=value.get_size()))
            else:
                log.info('skipping %s -> %s' % (value.path, key_name))
        except SSLError as exc:
            log.error('%s -> %s (%s)' % (value.path, key_name, exc))
            put_queue.put(args)
            connection, bucket = None, None
        except IOError as exc:
            log.error('%s -> %s (%s)' % (value.path, key_name, exc))
        put_queue.task_done()