def _get_filelist_remote(remote_uri, recursive = True): ## If remote_uri ends with '/' then all remote files will have ## the remote_uri prefix removed in the relative path. ## If, on the other hand, the remote_uri ends with something else ## (probably alphanumeric symbol) we'll use the last path part ## in the relative path. ## ## Complicated, eh? See an example: ## _get_filelist_remote("s3://bckt/abc/def") may yield: ## { 'def/file1.jpg' : {}, 'def/xyz/blah.txt' : {} } ## _get_filelist_remote("s3://bckt/abc/def/") will yield: ## { 'file1.jpg' : {}, 'xyz/blah.txt' : {} } ## Furthermore a prefix-magic can restrict the return list: ## _get_filelist_remote("s3://bckt/abc/def/x") yields: ## { 'xyz/blah.txt' : {} } info(u"Retrieving list of remote files for %s ..." % remote_uri) s3 = S3(Config()) response = s3.bucket_list(remote_uri.bucket(), prefix = remote_uri.object(), recursive = recursive) rem_base_original = rem_base = remote_uri.object() remote_uri_original = remote_uri if rem_base != '' and rem_base[-1] != '/': rem_base = rem_base[:rem_base.rfind('/')+1] remote_uri = S3Uri("s3://%s/%s" % (remote_uri.bucket(), rem_base)) rem_base_len = len(rem_base) rem_list = FileDict(ignore_case = False) break_now = False for object in response['list']: if object['Key'] == rem_base_original and object['Key'][-1] != "/": ## We asked for one file and we got that file :-) key = os.path.basename(object['Key']) object_uri_str = remote_uri_original.uri() break_now = True rem_list = FileDict(ignore_case = False) ## Remove whatever has already been put to rem_list else: key = object['Key'][rem_base_len:] ## Beware - this may be '' if object['Key']==rem_base !! object_uri_str = remote_uri.uri() + key rem_list[key] = { 'size' : int(object['Size']), 'timestamp' : dateS3toUnix(object['LastModified']), ## Sadly it's upload time, not our lastmod time :-( 'md5' : object['ETag'][1:-1], 'object_key' : object['Key'], 'object_uri_str' : object_uri_str, 'base_uri' : remote_uri, 'dev' : None, 'inode' : None, } if rem_list[key]['md5'].find("-") != -1: # always get it for multipart uploads _get_remote_attribs(S3Uri(object_uri_str), rem_list[key]) md5 = rem_list[key]['md5'] rem_list.record_md5(key, md5) if break_now: break return rem_list
def _get_filelist_remote(remote_uri, recursive = True): ## If remote_uri ends with '/' then all remote files will have ## the remote_uri prefix removed in the relative path. ## If, on the other hand, the remote_uri ends with something else ## (probably alphanumeric symbol) we'll use the last path part ## in the relative path. ## ## Complicated, eh? See an example: ## _get_filelist_remote("s3://bckt/abc/def") may yield: ## { 'def/file1.jpg' : {}, 'def/xyz/blah.txt' : {} } ## _get_filelist_remote("s3://bckt/abc/def/") will yield: ## { 'file1.jpg' : {}, 'xyz/blah.txt' : {} } ## Furthermore a prefix-magic can restrict the return list: ## _get_filelist_remote("s3://bckt/abc/def/x") yields: ## { 'xyz/blah.txt' : {} } info(u"Retrieving list of remote files for %s ..." % remote_uri) s3 = S3(Config()) response = s3.bucket_list(remote_uri.bucket(), prefix = remote_uri.object(), recursive = recursive) rem_base_original = rem_base = remote_uri.object() remote_uri_original = remote_uri if rem_base != '' and rem_base[-1] != '/': rem_base = rem_base[:rem_base.rfind('/')+1] remote_uri = S3Uri("s3://%s/%s" % (remote_uri.bucket(), rem_base)) rem_base_len = len(rem_base) rem_list = FileDict(ignore_case = False) break_now = False for object in response['list']: if object['Key'] == rem_base_original and object['Key'][-1] != "/": ## We asked for one file and we got that file :-) key = os.path.basename(object['Key']) object_uri_str = remote_uri_original.uri() break_now = True rem_list = FileDict(ignore_case = False) ## Remove whatever has already been put to rem_list else: key = object['Key'][rem_base_len:] ## Beware - this may be '' if object['Key']==rem_base !! object_uri_str = remote_uri.uri() + key rem_list[key] = { 'size' : int(object['Size']), 'timestamp' : dateS3toUnix(object['LastModified']), ## Sadly it's upload time, not our lastmod time :-( 'md5' : object['ETag'][1:-1], 'object_key' : object['Key'], 'object_uri_str' : object_uri_str, 'base_uri' : remote_uri, 'dev' : None, 'inode' : None, } if rem_list[key]['md5'].find("-"): # always get it for multipart uploads _get_remote_attribs(S3Uri(object_uri_str), rem_list[key]) md5 = rem_list[key]['md5'] rem_list.record_md5(key, md5) if break_now: break return rem_list
def fetch_remote_list(args, require_attribs = False, recursive = None): def _get_filelist_remote(remote_uri, recursive = True): ## If remote_uri ends with '/' then all remote files will have ## the remote_uri prefix removed in the relative path. ## If, on the other hand, the remote_uri ends with something else ## (probably alphanumeric symbol) we'll use the last path part ## in the relative path. ## ## Complicated, eh? See an example: ## _get_filelist_remote("s3://bckt/abc/def") may yield: ## { 'def/file1.jpg' : {}, 'def/xyz/blah.txt' : {} } ## _get_filelist_remote("s3://bckt/abc/def/") will yield: ## { 'file1.jpg' : {}, 'xyz/blah.txt' : {} } ## Furthermore a prefix-magic can restrict the return list: ## _get_filelist_remote("s3://bckt/abc/def/x") yields: ## { 'xyz/blah.txt' : {} } info(u"Retrieving list of remote files for %s ..." % remote_uri) s3 = S3(Config()) response = s3.bucket_list(remote_uri.bucket(), prefix = remote_uri.object(), recursive = recursive) rem_base_original = rem_base = remote_uri.object() remote_uri_original = remote_uri if rem_base != '' and rem_base[-1] != '/': rem_base = rem_base[:rem_base.rfind('/')+1] remote_uri = S3Uri("s3://%s/%s" % (remote_uri.bucket(), rem_base)) rem_base_len = len(rem_base) rem_list = FileDict(ignore_case = False) break_now = False for object in response['list']: if object['Key'] == rem_base_original and object['Key'][-1] != "/": ## We asked for one file and we got that file :-) key = os.path.basename(object['Key']) object_uri_str = remote_uri_original.uri() break_now = True rem_list = FileDict(ignore_case = False) ## Remove whatever has already been put to rem_list else: key = object['Key'][rem_base_len:] ## Beware - this may be '' if object['Key']==rem_base !! object_uri_str = remote_uri.uri() + key rem_list[key] = { 'size' : int(object['Size']), 'timestamp' : dateS3toUnix(object['LastModified']), ## Sadly it's upload time, not our lastmod time :-( 'md5' : object['ETag'][1:-1], 'object_key' : object['Key'], 'object_uri_str' : object_uri_str, 'base_uri' : remote_uri, 'dev' : None, 'inode' : None, } md5 = object['ETag'][1:-1] rem_list.record_md5(key, md5) if break_now: break return rem_list cfg = Config() remote_uris = [] remote_list = FileDict(ignore_case = False) if type(args) not in (list, tuple): args = [args] if recursive == None: recursive = cfg.recursive for arg in args: uri = S3Uri(arg) if not uri.type == 's3': raise ParameterError("Expecting S3 URI instead of '%s'" % arg) remote_uris.append(uri) if recursive: for uri in remote_uris: objectlist = _get_filelist_remote(uri) for key in objectlist: remote_list[key] = objectlist[key] remote_list.record_md5(key, objectlist.get_md5(key)) else: for uri in remote_uris: uri_str = str(uri) ## Wildcards used in remote URI? ## If yes we'll need a bucket listing... if uri_str.find('*') > -1 or uri_str.find('?') > -1: first_wildcard = uri_str.find('*') first_questionmark = uri_str.find('?') if first_questionmark > -1 and first_questionmark < first_wildcard: first_wildcard = first_questionmark prefix = uri_str[:first_wildcard] rest = uri_str[first_wildcard+1:] ## Only request recursive listing if the 'rest' of the URI, ## i.e. the part after first wildcard, contains '/' need_recursion = rest.find('/') > -1 objectlist = _get_filelist_remote(S3Uri(prefix), recursive = need_recursion) for key in objectlist: ## Check whether the 'key' matches the requested wildcards if glob.fnmatch.fnmatch(objectlist[key]['object_uri_str'], uri_str): remote_list[key] = objectlist[key] else: ## No wildcards - simply append the given URI to the list key = os.path.basename(uri.object()) if not key: raise ParameterError(u"Expecting S3 URI with a filename or --recursive: %s" % uri.uri()) remote_item = { 'base_uri': uri, 'object_uri_str': unicode(uri), 'object_key': uri.object() } if require_attribs: response = S3(cfg).object_info(uri) remote_item.update({ 'size': int(response['headers']['content-length']), 'md5': response['headers']['etag'].strip('"\''), 'timestamp' : dateRFC822toUnix(response['headers']['date']) }) # get md5 from header if it's present. We would have set that during upload if response['headers'].has_key('x-amz-meta-s3cmd-attrs'): attrs = parse_attrs_header(response['headers']['x-amz-meta-s3cmd-attrs']) if attrs.has_key('md5'): remote_item.update({'md5': attrs['md5']}) remote_list[key] = remote_item return remote_list
def fetch_remote_list(args, require_attribs=False, recursive=None, uri_params={}): def _get_remote_attribs(uri, remote_item): response = S3(cfg).object_info(uri) if not response.get('headers'): return remote_item.update({ 'size': int(response['headers']['content-length']), 'md5': response['headers']['etag'].strip('"\''), 'timestamp': dateRFC822toUnix(response['headers']['last-modified']) }) try: md5 = response['s3cmd-attrs']['md5'] remote_item.update({'md5': md5}) debug(u"retreived md5=%s from headers" % md5) except KeyError: pass def _get_filelist_remote(remote_uri, recursive=True): ## If remote_uri ends with '/' then all remote files will have ## the remote_uri prefix removed in the relative path. ## If, on the other hand, the remote_uri ends with something else ## (probably alphanumeric symbol) we'll use the last path part ## in the relative path. ## ## Complicated, eh? See an example: ## _get_filelist_remote("s3://bckt/abc/def") may yield: ## { 'def/file1.jpg' : {}, 'def/xyz/blah.txt' : {} } ## _get_filelist_remote("s3://bckt/abc/def/") will yield: ## { 'file1.jpg' : {}, 'xyz/blah.txt' : {} } ## Furthermore a prefix-magic can restrict the return list: ## _get_filelist_remote("s3://bckt/abc/def/x") yields: ## { 'xyz/blah.txt' : {} } info(u"Retrieving list of remote files for %s ..." % remote_uri) empty_fname_re = re.compile(r'\A\s*\Z') total_size = 0 s3 = S3(Config()) response = s3.bucket_list(remote_uri.bucket(), prefix=remote_uri.object(), recursive=recursive, uri_params=uri_params) rem_base_original = rem_base = remote_uri.object() remote_uri_original = remote_uri if rem_base != '' and rem_base[-1] != '/': rem_base = rem_base[:rem_base.rfind('/') + 1] remote_uri = S3Uri(u"s3://%s/%s" % (remote_uri.bucket(), rem_base)) rem_base_len = len(rem_base) rem_list = FileDict(ignore_case=False) break_now = False for object in response['list']: if object['Key'] == rem_base_original and object['Key'][-1] != "/": ## We asked for one file and we got that file :-) key = unicodise(os.path.basename(deunicodise(object['Key']))) object_uri_str = remote_uri_original.uri() break_now = True rem_list = FileDict( ignore_case=False ) ## Remove whatever has already been put to rem_list else: key = object['Key'][ rem_base_len:] ## Beware - this may be '' if object['Key']==rem_base !! object_uri_str = remote_uri.uri() + key if empty_fname_re.match(key): # Objects may exist on S3 with empty names (''), which don't map so well to common filesystems. warning(u"Empty object name on S3 found, ignoring.") continue rem_list[key] = { 'size': int(object['Size']), 'timestamp': dateS3toUnix( object['LastModified'] ), ## Sadly it's upload time, not our lastmod time :-( 'md5': object['ETag'].strip('"\''), 'object_key': object['Key'], 'object_uri_str': object_uri_str, 'base_uri': remote_uri, 'dev': None, 'inode': None, } if '-' in rem_list[key][ 'md5']: # always get it for multipart uploads _get_remote_attribs(S3Uri(object_uri_str), rem_list[key]) md5 = rem_list[key]['md5'] rem_list.record_md5(key, md5) total_size += int(object['Size']) if break_now: break return rem_list, total_size cfg = Config() remote_uris = [] remote_list = FileDict(ignore_case=False) if type(args) not in (list, tuple, set): args = [args] if recursive == None: recursive = cfg.recursive for arg in args: uri = S3Uri(arg) if not uri.type == 's3': raise ParameterError("Expecting S3 URI instead of '%s'" % arg) remote_uris.append(uri) total_size = 0 if recursive: for uri in remote_uris: objectlist, tmp_total_size = _get_filelist_remote(uri, recursive=True) total_size += tmp_total_size for key in objectlist: remote_list[key] = objectlist[key] remote_list.record_md5(key, objectlist.get_md5(key)) else: for uri in remote_uris: uri_str = uri.uri() ## Wildcards used in remote URI? ## If yes we'll need a bucket listing... wildcard_split_result = re.split("\*|\?", uri_str, maxsplit=1) if len(wildcard_split_result) == 2: # wildcards found prefix, rest = wildcard_split_result ## Only request recursive listing if the 'rest' of the URI, ## i.e. the part after first wildcard, contains '/' need_recursion = '/' in rest objectlist, tmp_total_size = _get_filelist_remote( S3Uri(prefix), recursive=need_recursion) total_size += tmp_total_size for key in objectlist: ## Check whether the 'key' matches the requested wildcards if glob.fnmatch.fnmatch(objectlist[key]['object_uri_str'], uri_str): remote_list[key] = objectlist[key] else: ## No wildcards - simply append the given URI to the list key = unicodise(os.path.basename(deunicodise(uri.object()))) if not key: raise ParameterError( u"Expecting S3 URI with a filename or --recursive: %s" % uri.uri()) remote_item = { 'base_uri': uri, 'object_uri_str': uri.uri(), 'object_key': uri.object() } if require_attribs: _get_remote_attribs(uri, remote_item) remote_list[key] = remote_item md5 = remote_item.get('md5') if md5: remote_list.record_md5(key, md5) total_size += remote_item.get('size', 0) remote_list, exclude_list = filter_exclude_include(remote_list) return remote_list, exclude_list, total_size
def fetch_remote_list(args, require_attribs = False, recursive = None, uri_params = {}): def _get_remote_attribs(uri, remote_item): response = S3(cfg).object_info(uri) remote_item.update({ 'size': int(response['headers']['content-length']), 'md5': response['headers']['etag'].strip('"\''), 'timestamp' : dateRFC822toUnix(response['headers']['date']) }) try: md5 = response['s3cmd-attrs']['md5'] remote_item.update({'md5': md5}) debug(u"retreived md5=%s from headers" % md5) except KeyError: pass def _get_filelist_remote(remote_uri, recursive = True): ## If remote_uri ends with '/' then all remote files will have ## the remote_uri prefix removed in the relative path. ## If, on the other hand, the remote_uri ends with something else ## (probably alphanumeric symbol) we'll use the last path part ## in the relative path. ## ## Complicated, eh? See an example: ## _get_filelist_remote("s3://bckt/abc/def") may yield: ## { 'def/file1.jpg' : {}, 'def/xyz/blah.txt' : {} } ## _get_filelist_remote("s3://bckt/abc/def/") will yield: ## { 'file1.jpg' : {}, 'xyz/blah.txt' : {} } ## Furthermore a prefix-magic can restrict the return list: ## _get_filelist_remote("s3://bckt/abc/def/x") yields: ## { 'xyz/blah.txt' : {} } info(u"Retrieving list of remote files for %s ..." % remote_uri) empty_fname_re = re.compile(r'\A\s*\Z') s3 = S3(Config()) response = s3.bucket_list(remote_uri.bucket(), prefix = remote_uri.object(), recursive = recursive, uri_params = uri_params) rem_base_original = rem_base = remote_uri.object() remote_uri_original = remote_uri if rem_base != '' and rem_base[-1] != '/': rem_base = rem_base[:rem_base.rfind('/')+1] remote_uri = S3Uri("s3://%s/%s" % (remote_uri.bucket(), rem_base)) rem_base_len = len(rem_base) rem_list = FileDict(ignore_case = False) break_now = False for object in response['list']: if object['Key'] == rem_base_original and object['Key'][-1] != "/": ## We asked for one file and we got that file :-) key = os.path.basename(object['Key']) object_uri_str = remote_uri_original.uri() break_now = True rem_list = FileDict(ignore_case = False) ## Remove whatever has already been put to rem_list else: key = object['Key'][rem_base_len:] ## Beware - this may be '' if object['Key']==rem_base !! object_uri_str = remote_uri.uri() + key if empty_fname_re.match(key): # Objects may exist on S3 with empty names (''), which don't map so well to common filesystems. warning(u"Empty object name on S3 found, ignoring.") continue rem_list[key] = { 'size' : int(object['Size']), 'timestamp' : dateS3toUnix(object['LastModified']), ## Sadly it's upload time, not our lastmod time :-( 'md5' : object['ETag'][1:-1], 'object_key' : object['Key'], 'object_uri_str' : object_uri_str, 'base_uri' : remote_uri, 'dev' : None, 'inode' : None, } if rem_list[key]['md5'].find("-") > 0: # always get it for multipart uploads _get_remote_attribs(S3Uri(object_uri_str), rem_list[key]) md5 = rem_list[key]['md5'] rem_list.record_md5(key, md5) if break_now: break return rem_list cfg = Config() remote_uris = [] remote_list = FileDict(ignore_case = False) if type(args) not in (list, tuple): args = [args] if recursive == None: recursive = cfg.recursive for arg in args: uri = S3Uri(arg) if not uri.type == 's3': raise ParameterError("Expecting S3 URI instead of '%s'" % arg) remote_uris.append(uri) if recursive: for uri in remote_uris: objectlist = _get_filelist_remote(uri, recursive = True) for key in objectlist: remote_list[key] = objectlist[key] remote_list.record_md5(key, objectlist.get_md5(key)) else: for uri in remote_uris: uri_str = unicode(uri) ## Wildcards used in remote URI? ## If yes we'll need a bucket listing... wildcard_split_result = re.split("\*|\?", uri_str, maxsplit=1) if len(wildcard_split_result) == 2: # wildcards found prefix, rest = wildcard_split_result ## Only request recursive listing if the 'rest' of the URI, ## i.e. the part after first wildcard, contains '/' need_recursion = '/' in rest objectlist = _get_filelist_remote(S3Uri(prefix), recursive = need_recursion) for key in objectlist: ## Check whether the 'key' matches the requested wildcards if glob.fnmatch.fnmatch(objectlist[key]['object_uri_str'], uri_str): remote_list[key] = objectlist[key] else: ## No wildcards - simply append the given URI to the list key = os.path.basename(uri.object()) if not key: raise ParameterError(u"Expecting S3 URI with a filename or --recursive: %s" % uri.uri()) remote_item = { 'base_uri': uri, 'object_uri_str': unicode(uri), 'object_key': uri.object() } if require_attribs: _get_remote_attribs(uri, remote_item) remote_list[key] = remote_item md5 = remote_item.get('md5') if md5: remote_list.record_md5(key, md5) remote_list, exclude_list = filter_exclude_include(remote_list) return remote_list, exclude_list