def testPluralityCheckableIteratorWith1Elem1Exception(self): """Tests PluralityCheckableIterator with 2 elements. The second element raises an exception. """ class IterTest(object): def __init__(self): self.position = 0 def __iter__(self): return self def next(self): if self.position == 0: self.position += 1 return 1 elif self.position == 1: self.position += 1 raise CustomTestException('Test exception') else: raise StopIteration() pcit = PluralityCheckableIterator(IterTest()) self.assertFalse(pcit.IsEmpty()) self.assertTrue(pcit.HasPlurality()) iterated_value = None try: for value in pcit: iterated_value = value self.fail('Expected exception from iterator') except CustomTestException: pass self.assertEqual(iterated_value, 1)
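The tests in this section exercise a small contract of PluralityCheckableIterator: emptiness and plurality checks that buffer rather than consume, buffered exceptions that are re-raised in order, and ordinary iteration that drains the buffer first. For reference, here is a minimal, self-contained sketch of that contract. It is an illustrative re-implementation written for these notes, not gsutil's plurality_checkable_iterator module, and it omits the special handling of yielded (exception, stack_trace) tuples shown in a later test.

import sys


class MiniPluralityCheckableIterator(object):
  """Illustrative stand-in: buffers up to two results so IsEmpty() and
  HasPlurality() can be answered before iteration begins."""

  def __init__(self, it):
    self.orig_iterator = iter(it)
    self.buffer = []  # Entries are ('value', v) or ('exception', e).

  def _Fill(self, num_elements):
    # Pull from the underlying iterator only until the buffer is big enough.
    while len(self.buffer) < num_elements:
      try:
        self.buffer.append(('value', next(self.orig_iterator)))
      except StopIteration:
        break
      except Exception as e:  # pylint: disable=broad-except
        self.buffer.append(('exception', e))

  def IsEmpty(self):
    self._Fill(1)
    return not self.buffer

  def HasPlurality(self):
    self._Fill(2)
    return len(self.buffer) > 1

  def PeekException(self):
    # Raises the next buffered exception, if any, without consuming it.
    self._Fill(1)
    if self.buffer and self.buffer[0][0] == 'exception':
      raise self.buffer[0][1]

  def __iter__(self):
    return self

  def __next__(self):
    self._Fill(1)
    if not self.buffer:
      raise StopIteration()
    kind, payload = self.buffer.pop(0)
    if kind == 'exception':
      raise payload
    return payload

  next = __next__  # Python 2 spelling used by the older tests below.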
def _RecursePrint(self, blr): """ Expands a bucket listing reference and recurses to its children, calling _PrintInfoAboutBucketListingRef for each expanded object found. Args: blr: An instance of BucketListingRef. Returns: Tuple containing (number of objects, total number of bytes) """ num_bytes = 0 num_objs = 0 if blr.HasKey(): blr_iterator = iter([blr]) elif blr.HasPrefix(): blr_iterator = self.WildcardIterator( '%s/*' % blr.GetRStrippedUriString(), all_versions=self.all_versions) elif blr.NamesBucket(): blr_iterator = self.WildcardIterator( '%s*' % blr.GetUriString(), all_versions=self.all_versions) else: # This BLR didn't come from a bucket listing. This case happens for # BLR's instantiated from a user-provided URI. blr_iterator = PluralityCheckableIterator( UriOnlyBlrExpansionIterator( self, blr, all_versions=self.all_versions)) if blr_iterator.is_empty() and not ContainsWildcard(blr.GetUriString()): raise CommandException('No such object %s' % blr.GetUriString()) for cur_blr in blr_iterator: if self.exclude_patterns: tomatch = cur_blr.GetUriString() skip = False for pattern in self.exclude_patterns: if fnmatch.fnmatch(tomatch, pattern): skip = True break if skip: continue if cur_blr.HasKey(): # Object listing. no, nb = self._PrintInfoAboutBucketListingRef(cur_blr) else: # Subdir listing. if cur_blr.GetUriString().endswith('//'): # Expand gs://bucket// into gs://bucket//* so we don't infinite # loop. This case happens when user has uploaded an object whose # name begins with a /. cur_blr = BucketListingRef(self.suri_builder.StorageUri( '%s*' % cur_blr.GetUriString()), None, None, cur_blr.headers) no, nb = self._RecursePrint(cur_blr) num_bytes += nb num_objs += no if blr.HasPrefix() and not self.summary_only: self._PrintSummaryLine(num_bytes, blr.GetUriString().encode('utf-8')) return num_objs, num_bytes
def _GetIam(self, thread_state=None): """Gets IAM policy for single bucket or object.""" pattern = self.args[0] matches = PluralityCheckableIterator( self.WildcardIterator(pattern).IterAll( bucket_listing_fields=['name'])) if matches.IsEmpty(): raise CommandException('%s matched no URLs' % pattern) if matches.HasPlurality(): raise CommandException( '%s matched more than one URL, which is not allowed by the %s ' 'command' % (pattern, self.command_name)) storage_url = StorageUrlFromString(list(matches)[0].url_string) policy = self.GetIamHelper(storage_url, thread_state=thread_state) policy_json = json.loads(protojson.encode_message(policy)) policy_str = json.dumps( policy_json, sort_keys=True, separators=(',', ': '), indent=2, ) print(policy_str)
def testPluralityCheckableIteratorWith0Elems(self): """Tests empty PluralityCheckableIterator.""" input_list = list(range(0)) it = iter(input_list) pcit = PluralityCheckableIterator(it) self.assertTrue(pcit.IsEmpty()) self.assertFalse(pcit.HasPlurality()) output_list = list(pcit) self.assertEqual(input_list, output_list)
def testPluralityCheckableIteratorWith3Elems(self): """Tests PluralityCheckableIterator with 3 elements.""" input_list = range(3) it = iter(input_list) pcit = PluralityCheckableIterator(it) self.assertFalse(pcit.is_empty()) self.assertTrue(pcit.has_plurality()) output_list = list(pcit) self.assertEqual(input_list, output_list)
def NameExpansionIterator(command_name, debug, logger, gsutil_api, url_strs, recursion_requested, all_versions=False, cmd_supports_recursion=True, project_id=None, continue_on_error=False): """Static factory function for instantiating _NameExpansionIterator. This wraps the resulting iterator in a PluralityCheckableIterator and checks that it is non-empty. Also, allows url_strs to be either an array or an iterator. Args: command_name: name of command being run. debug: Debug level to pass to underlying iterators (range 0..3). logger: logging.Logger object. gsutil_api: Cloud storage interface. Settable for testing/mocking. url_strs: Iterable URL strings needing expansion. recursion_requested: True if -r specified on command-line. If so, listings will be flattened so mapped-to results contain objects spanning subdirectories. all_versions: Bool indicating whether to iterate over all object versions. cmd_supports_recursion: Bool indicating whether this command supports a '-r' flag. Useful for printing helpful error messages. project_id: Project id to use for the current command. continue_on_error: If true, yield no-match exceptions encountered during iteration instead of raising them. Raises: CommandException if underlying iterator is empty. Returns: Name expansion iterator instance. For example semantics, see comments in NameExpansionIterator.__init__. """ url_strs = PluralityCheckableIterator(url_strs) name_expansion_iterator = _NameExpansionIterator( command_name, debug, logger, gsutil_api, url_strs, recursion_requested, all_versions=all_versions, cmd_supports_recursion=cmd_supports_recursion, project_id=project_id, continue_on_error=continue_on_error) name_expansion_iterator = PluralityCheckableIterator( name_expansion_iterator) if name_expansion_iterator.IsEmpty(): raise CommandException('No URLs matched') return name_expansion_iterator
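A key property the factory relies on is that the emptiness check does not consume results: the peeked element is buffered and re-yielded when the caller iterates. A hedged usage sketch (the generator and URL strings are made up; the imports assume gsutil's gslib package layout):

from gslib.exception import CommandException
from gslib.plurality_checkable_iterator import PluralityCheckableIterator


def _SimulatedListing():
  # Stand-in for an expensive, lazily evaluated listing.
  for i in range(3):
    yield 'gs://some-bucket/obj%d' % i


expansions = PluralityCheckableIterator(_SimulatedListing())
if expansions.IsEmpty():      # Buffers at most one element.
  raise CommandException('No URLs matched')
for url in expansions:        # The buffered element is yielded first.
  print(url)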
def testPluralityCheckableIteratorWith2Elems(self): """Tests PluralityCheckableIterator with 2 elements.""" input_list = range(2) it = iter(input_list) pcit = PluralityCheckableIterator(it) self.assertFalse(pcit.is_empty()) self.assertTrue(pcit.has_plurality()) output_list = list(pcit) self.assertEqual(input_list, output_list)
def testPluralityCheckableIteratorWith3Elems(self): """Tests PluralityCheckableIterator with 3 elements.""" input_list = range(3) it = iter(input_list) pcit = PluralityCheckableIterator(it) self.assertFalse(pcit.IsEmpty()) self.assertTrue(pcit.HasPlurality()) output_list = list(pcit) self.assertEqual(input_list, output_list)
def ExpandUrlAndPrint(self, url): """Iterates over the given URL and calls print functions. Args: url: StorageUrl to iterate over. Returns: (num_objects, num_bytes) total number of objects and bytes iterated. """ num_objects = 0 num_dirs = 0 num_bytes = 0 print_newline = False if url.IsBucket() or self.should_recurse: # IsBucket() implies a top-level listing. if url.IsBucket(): self._print_bucket_header_func(url) return self._RecurseExpandUrlAndPrint(url.url_string, print_initial_newline=False) else: # User provided a prefix or object URL, but it's impossible to tell # which until we do a listing and see what matches. top_level_iterator = PluralityCheckableIterator(self._iterator_func( url.CreatePrefixUrl(wildcard_suffix=None), all_versions=self.all_versions).IterAll( expand_top_level_buckets=True, bucket_listing_fields=self.bucket_listing_fields)) plurality = top_level_iterator.HasPlurality() for blr in top_level_iterator: if self._MatchesExcludedPattern(blr): continue if blr.IsObject(): nd = 0 no, nb = self._print_object_func(blr) print_newline = True elif blr.IsPrefix(): if print_newline: self._print_newline_func() else: print_newline = True if plurality: self._print_dir_header_func(blr) expansion_url_str = StorageUrlFromString( blr.url_string).CreatePrefixUrl(wildcard_suffix='*') nd, no, nb = self._RecurseExpandUrlAndPrint(expansion_url_str) self._print_dir_summary_func(nb, blr) else: # We handle all buckets at the top level, so this should never happen. raise CommandException( 'Sub-level iterator returned a CsBucketListingRef of type Bucket') num_objects += no num_dirs += nd num_bytes += nb return num_dirs, num_objects, num_bytes
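HasPlurality() above drives the same convention UNIX ls uses: per-directory headers are printed only when more than one subdirectory is being listed. A small illustration of that decision in isolation (the URLs are made up; assumes the real PluralityCheckableIterator is importable):

from gslib.plurality_checkable_iterator import PluralityCheckableIterator

matches = PluralityCheckableIterator(
    iter(['gs://bucket/dir1/', 'gs://bucket/dir2/']))
print_headers = matches.HasPlurality()  # True, so each subdir gets a header.
for url in matches:
  if print_headers:
    print('%s:' % url)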
def __init__(self, command_obj, base_src_url, base_dst_url): self.command_obj = command_obj self.compute_file_checksums = command_obj.compute_file_checksums self.delete_extras = command_obj.delete_extras self.recursion_requested = command_obj.recursion_requested self.logger = self.command_obj.logger self.base_src_url = base_src_url self.base_dst_url = base_dst_url self.logger.info('Building synchronization state...') (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp( prefix='gsutil-rsync-src-') _tmp_files.append(self.sorted_list_src_file_name) (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp( prefix='gsutil-rsync-dst-') _tmp_files.append(self.sorted_list_dst_file_name) # Close the file handles; the file will be opened in write mode by # _ListUrlRootFunc. os.close(src_fh) os.close(dst_fh) # Build sorted lists of src and dst URLs in parallel. To do this, pass args # to _ListUrlRootFunc as tuple (base_url_str, out_filename, desc) # where base_url_str is the starting URL string for listing. args_iter = iter([(self.base_src_url.url_string, self.sorted_list_src_file_name, 'source'), (self.base_dst_url.url_string, self.sorted_list_dst_file_name, 'destination')]) # Contains error message from non-retryable listing failure. command_obj.non_retryable_listing_failures = 0 shared_attrs = ['non_retryable_listing_failures'] command_obj.Apply(_ListUrlRootFunc, args_iter, _RootListingExceptionHandler, shared_attrs, arg_checker=DummyArgChecker, parallel_operations_override=True, fail_on_error=True) if command_obj.non_retryable_listing_failures: raise CommandException( 'Caught non-retryable exception - aborting rsync') self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'r') self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'r') # Wrap iterators in PluralityCheckableIterator so we can check emptiness. self.sorted_src_urls_it = PluralityCheckableIterator( iter(self.sorted_list_src_file)) self.sorted_dst_urls_it = PluralityCheckableIterator( iter(self.sorted_list_dst_file))
def __init__(self, command_obj, base_src_url, base_dst_url): self.command_obj = command_obj self.compute_file_checksums = command_obj.compute_file_checksums self.delete_extras = command_obj.delete_extras self.recursion_requested = command_obj.recursion_requested self.logger = self.command_obj.logger self.base_src_url = base_src_url self.base_dst_url = base_dst_url self.logger.info("Building synchronization state...") (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp(prefix="gsutil-rsync-src-") _tmp_files.append(self.sorted_list_src_file_name) (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp(prefix="gsutil-rsync-dst-") _tmp_files.append(self.sorted_list_dst_file_name) # Close the file handles; the file will be opened in write mode by # _ListUrlRootFunc. os.close(src_fh) os.close(dst_fh) # Build sorted lists of src and dst URLs in parallel. To do this, pass args # to _ListUrlRootFunc as tuple (base_url_str, out_filename, desc) # where base_url_str is the starting URL string for listing. args_iter = iter( [ (self.base_src_url.url_string, self.sorted_list_src_file_name, "source"), (self.base_dst_url.url_string, self.sorted_list_dst_file_name, "destination"), ] ) # Contains error message from non-retryable listing failure. command_obj.non_retryable_listing_failures = 0 shared_attrs = ["non_retryable_listing_failures"] command_obj.Apply( _ListUrlRootFunc, args_iter, _RootListingExceptionHandler, shared_attrs, arg_checker=DummyArgChecker, parallel_operations_override=True, fail_on_error=True, ) if command_obj.non_retryable_listing_failures: raise CommandException("Caught non-retryable exception - aborting rsync") self.sorted_list_src_file = open(self.sorted_list_src_file_name, "r") self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, "r") # Wrap iterators in PluralityCheckableIterator so we can check emptiness. self.sorted_src_urls_it = PluralityCheckableIterator(iter(self.sorted_list_src_file)) self.sorted_dst_urls_it = PluralityCheckableIterator(iter(self.sorted_list_dst_file))
def __init__(self, command_obj, base_src_url, base_dst_url): self.command_obj = command_obj self.compute_checksums = command_obj.compute_checksums self.delete_extras = command_obj.delete_extras self.recursion_requested = command_obj.recursion_requested self.logger = self.command_obj.logger self.base_src_url = base_src_url self.base_dst_url = base_dst_url self.logger.info('Building synchronization state...') (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp( prefix='gsutil-rsync-src-') (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp( prefix='gsutil-rsync-dst-') # Close the file handles; the file will be opened in write mode by # _ListUrlRootFunc. os.close(src_fh) os.close(dst_fh) # Build sorted lists of src and dst URLs in parallel. To do this, pass args # to _ListUrlRootFunc as tuple (url_str, out_file_name, desc). args_iter = iter([ (self.base_src_url.GetUrlString(), self.sorted_list_src_file_name, 'source'), (self.base_dst_url.GetUrlString(), self.sorted_list_dst_file_name, 'destination') ]) if IS_WINDOWS: # Don't use multi-processing on Windows (very broken). thread_count = 2 process_count = 1 else: # Otherwise use multi-processing, to avoid Python global thread lock # contention. thread_count = 1 process_count = 2 command_obj.Apply(_ListUrlRootFunc, args_iter, _RootListingExceptionHandler, arg_checker=DummyArgChecker, parallel_operations_override=True, thread_count=thread_count, process_count=process_count, fail_on_error=True) self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'r') self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'r') # Wrap iterators in PluralityCheckableIterator so we can check emptiness. self.sorted_src_urls_it = PluralityCheckableIterator( iter(self.sorted_list_src_file)) self.sorted_dst_urls_it = PluralityCheckableIterator( iter(self.sorted_list_dst_file))
def _GetIam(self, pattern, thread_state=None): """Gets IAM policy for single bucket or object.""" matches = PluralityCheckableIterator( self.WildcardIterator(pattern).IterAll( bucket_listing_fields=['name'])) if matches.IsEmpty(): raise CommandException('%s matched no URLs' % pattern) if matches.HasPlurality(): raise CommandException( '%s matched more than one URL, which is not allowed by the %s ' 'command' % (pattern, self.command_name)) storage_url = StorageUrlFromString(list(matches)[0].url_string) return self.GetIamHelper(storage_url, thread_state=thread_state)
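The zero-match and multiple-match checks in _GetIam form a reusable exactly-one-match guard. A hedged sketch of that pattern factored into a helper (ExactlyOneMatch is hypothetical, not a gsutil function; imports assume the gslib layout):

from gslib.exception import CommandException
from gslib.plurality_checkable_iterator import PluralityCheckableIterator


def ExactlyOneMatch(pattern, blr_iterator, command_name):
  """Returns the single listing ref matching pattern, or raises."""
  matches = PluralityCheckableIterator(blr_iterator)
  if matches.IsEmpty():
    raise CommandException('%s matched no URLs' % pattern)
  if matches.HasPlurality():
    raise CommandException(
        '%s matched more than one URL, which is not allowed by the %s '
        'command' % (pattern, command_name))
  return list(matches)[0]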
def __init__(self, command_name, debug, gsutil_api, url_strs, recursion_requested, all_versions=False, cmd_supports_recursion=True, project_id=None): """Initializes a _NameExpansionIterator with the inputs.""" # Only count data bytes for commands that will transfer/rewrite data. # Note that the rsync command uses a different iterator, thus it is not # included here. self.count_data_bytes = command_name in ('cp', 'mv', 'rewrite') # Only query the file size if we are counting data bytes, as this may # result in stat'ing files, which is more expensive. bucket_listing_fields = ['size'] if self.count_data_bytes else None self.name_expansion_iterator = _NameExpansionIterator( command_name, debug, logging.getLogger('dummy'), gsutil_api, PluralityCheckableIterator(url_strs), recursion_requested, all_versions=all_versions, cmd_supports_recursion=cmd_supports_recursion, project_id=project_id, continue_on_error=True, bucket_listing_fields=bucket_listing_fields)
def __iter__(self): for blr in self.blr_iter: uri = blr.GetUri() if uri.names_object(): # URI could be a bucket subdir. implicit_subdir_iterator = PluralityCheckableIterator( self.name_expansion_instance._WildcardIterator( self.name_expansion_instance.suri_builder.StorageUri( '%s/%s' % (uri.uri.rstrip('/'), self.name_expansion_instance. _flatness_wildcard[self.flat])))) if not implicit_subdir_iterator.is_empty(): for exp_blr in implicit_subdir_iterator: yield (True, exp_blr) else: yield (False, blr) else: yield (False, blr)
def NameExpansionIterator(command_name, proj_id_handler, headers, debug, bucket_storage_uri_class, uri_strs, recursion_requested, have_existing_dst_container=None, flat=True): """ Static factory function for instantiating _NameExpansionIterator. This wraps the resulting iterator in a PluralityCheckableIterator and checks that it is non-empty. Args are as documented in the constructor for the _NameExpansionIterator class. """ name_expansion_iterator = _NameExpansionIterator( command_name, proj_id_handler, headers, debug, bucket_storage_uri_class, uri_strs, recursion_requested, have_existing_dst_container, flat) name_expansion_iterator = PluralityCheckableIterator(name_expansion_iterator) if name_expansion_iterator.is_empty(): raise CommandException('No URIs matched') return name_expansion_iterator
def __iter__(self): for blr in self.blr_iter: uri = blr.GetUri() if uri.names_object(): # URI could be a bucket subdir. implicit_subdir_iterator = PluralityCheckableIterator( self.name_expansion_instance._WildcardIterator( self.name_expansion_instance.suri_builder.StorageUri( '%s/%s' % (uri.uri.rstrip('/'), self.name_expansion_instance._flatness_wildcard[ self.flat])))) if not implicit_subdir_iterator.is_empty(): for exp_blr in implicit_subdir_iterator: yield (True, exp_blr) else: yield (False, blr) else: yield (False, blr)
def testPluralityCheckableIteratorWith2Exceptions(self): """Tests PluralityCheckableIterator with 2 elements that both raise.""" class IterTest(six.Iterator): def __init__(self): self.position = 0 def __iter__(self): return self def __next__(self): if self.position < 2: self.position += 1 raise CustomTestException('Test exception %s' % self.position) else: raise StopIteration() pcit = PluralityCheckableIterator(IterTest()) try: pcit.PeekException() self.fail('Expected exception 1 from PeekException') except CustomTestException as e: self.assertIn(str(e), 'Test exception 1') try: for _ in pcit: pass self.fail('Expected exception 1 from iterator') except CustomTestException as e: self.assertIn(str(e), 'Test exception 1') try: pcit.PeekException() self.fail('Expected exception 2 from PeekException') except CustomTestException as e: self.assertIn(str(e), 'Test exception 2') try: for _ in pcit: pass self.fail('Expected exception 2 from iterator') except CustomTestException as e: self.assertIn(str(e), 'Test exception 2') for _ in pcit: self.fail('Expected StopIteration')
def __iter__(self): for blr in self.blr_iter: if blr.IsPrefix(): # This is a bucket subdirectory, list objects according to the wildcard. prefix_url = StorageUrlFromString(blr.url_string).CreatePrefixUrl( wildcard_suffix=self.subdir_exp_wildcard) implicit_subdir_iterator = PluralityCheckableIterator( self.name_exp_instance.WildcardIterator(prefix_url).IterAll( bucket_listing_fields=self.bucket_listing_fields)) if not implicit_subdir_iterator.IsEmpty(): for exp_blr in implicit_subdir_iterator: yield (True, exp_blr) else: # Prefix that contains no objects, for example in the $folder$ case # or an empty filesystem directory. yield (False, blr) elif blr.IsObject(): yield (False, blr) else: raise CommandException( '_ImplicitBucketSubdirIterator got a bucket reference %s' % blr)
def __init__(self, command_obj, base_src_url, base_dst_url): self.command_obj = command_obj self.compute_checksums = command_obj.compute_checksums self.delete_extras = command_obj.delete_extras self.recursion_requested = command_obj.recursion_requested self.logger = self.command_obj.logger self.base_src_url = base_src_url self.base_dst_url = base_dst_url self.logger.info('Building synchronization state...') (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp( prefix='gsutil-rsync-src-') (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp( prefix='gsutil-rsync-dst-') # Close the file handles; the file will be opened in write mode by # _ListUrlRootFunc. os.close(src_fh) os.close(dst_fh) # Build sorted lists of src and dst URLs in parallel. To do this, pass args # to _ListUrlRootFunc as tuple (url_str, out_file_name, desc). args_iter = iter([ (self.base_src_url.GetUrlString(), self.sorted_list_src_file_name, 'source'), (self.base_dst_url.GetUrlString(), self.sorted_list_dst_file_name, 'destination') ]) command_obj.Apply(_ListUrlRootFunc, args_iter, _RootListingExceptionHandler, arg_checker=DummyArgChecker, parallel_operations_override=True, fail_on_error=True) self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'rb') self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'rb') # Wrap iterators in PluralityCheckableIterator so we can check emptiness. self.sorted_src_urls_it = PluralityCheckableIterator( iter(self.sorted_list_src_file)) self.sorted_dst_urls_it = PluralityCheckableIterator( iter(self.sorted_list_dst_file))
def __init__(self, command_obj, base_src_url, base_dst_url): self.command_obj = command_obj self.compute_checksums = command_obj.compute_checksums self.delete_extras = command_obj.delete_extras self.recursion_requested = command_obj.recursion_requested self.logger = self.command_obj.logger self.base_src_url = base_src_url self.base_dst_url = base_dst_url self.logger.info('Building synchronization state...') (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp( prefix='gsutil-rsync-src-') (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp( prefix='gsutil-rsync-dst-') # Close the file handles; the file will be opened in write mode by # _ListUrlRootFunc. os.close(src_fh) os.close(dst_fh) # Build sorted lists of src and dst URLs in parallel. To do this, pass args # to _ListUrlRootFunc as tuple (url_str, out_file_name, desc). args_iter = iter([ (self.base_src_url.url_string, self.sorted_list_src_file_name, 'source'), (self.base_dst_url.url_string, self.sorted_list_dst_file_name, 'destination') ]) command_obj.Apply(_ListUrlRootFunc, args_iter, _RootListingExceptionHandler, arg_checker=DummyArgChecker, parallel_operations_override=True, fail_on_error=True) self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'r') self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'r') # Wrap iterators in PluralityCheckableIterator so we can check emptiness. self.sorted_src_urls_it = PluralityCheckableIterator( iter(self.sorted_list_src_file)) self.sorted_dst_urls_it = PluralityCheckableIterator( iter(self.sorted_list_dst_file))
def testPluralityCheckableIteratorWith2Exceptions(self): """Tests PluralityCheckableIterator with 2 elements that both raise.""" class IterTest(object): def __init__(self): self.position = 0 def __iter__(self): return self def next(self): if self.position < 2: self.position += 1 raise CustomTestException('Test exception %s' % self.position) else: raise StopIteration() pcit = PluralityCheckableIterator(IterTest()) try: pcit.PeekException() self.fail('Expected exception 1 from PeekException') except CustomTestException, e: self.assertIn(e.message, 'Test exception 1')
def testPluralityCheckableIteratorReadsAheadAsNeeded(self): """Tests that the PCI does not unnecessarily read new elements.""" class IterTest(six.Iterator): def __init__(self): self.position = 0 def __iter__(self): return self def __next__(self): if self.position == 3: raise StopIteration() self.position += 1 # IsEmpty and PeekException should retrieve only 1 element from the # underlying iterator. pcit = PluralityCheckableIterator(IterTest()) pcit.IsEmpty() pcit.PeekException() self.assertEquals(pcit.orig_iterator.position, 1) # HasPlurality requires populating 2 elements into the iterator. pcit.HasPlurality() self.assertEquals(pcit.orig_iterator.position, 2) # next should yield already-populated elements without advancing the # iterator. next(pcit) # Yields element 1 self.assertEquals(pcit.orig_iterator.position, 2) next(pcit) # Yields element 2 self.assertEquals(pcit.orig_iterator.position, 2) next(pcit) # Yields element 3 self.assertEquals(pcit.orig_iterator.position, 3) try: next(pcit) # Underlying iterator is empty self.fail('Expected StopIteration') except StopIteration: pass
def testPluralityCheckableIteratorReadsAheadAsNeeded(self): """Tests that the PCI does not unnecessarily read new elements.""" class IterTest(object): def __init__(self): self.position = 0 def __iter__(self): return self def next(self): if self.position == 3: raise StopIteration() self.position += 1 # IsEmpty and PeekException should retrieve only 1 element from the # underlying iterator. pcit = PluralityCheckableIterator(IterTest()) pcit.IsEmpty() pcit.PeekException() self.assertEquals(pcit.orig_iterator.position, 1) # HasPlurality requires populating 2 elements into the iterator. pcit.HasPlurality() self.assertEquals(pcit.orig_iterator.position, 2) # next should yield already-populated elements without advancing the # iterator. pcit.next() # Yields element 1 self.assertEquals(pcit.orig_iterator.position, 2) pcit.next() # Yields element 2 self.assertEquals(pcit.orig_iterator.position, 2) pcit.next() # Yields element 3 self.assertEquals(pcit.orig_iterator.position, 3) try: pcit.next() # Underlying iterator is empty self.fail('Expected StopIteration') except StopIteration: pass
def testPluralityCheckableIteratorWithYieldedException(self): """Tests PCI with an iterator that yields an exception. The yielded exception is in the form of a tuple and must also contain a stack trace. """ class IterTest(six.Iterator): def __init__(self): self.position = 0 def __iter__(self): return self def __next__(self): if self.position == 0: try: self.position += 1 raise CustomTestException('Test exception 0') except CustomTestException as e: return (e, sys.exc_info()[2]) elif self.position == 1: self.position += 1 return 1 else: raise StopIteration() pcit = PluralityCheckableIterator(IterTest()) iterated_value = None try: for _ in pcit: pass self.fail('Expected exception 0 from iterator') except CustomTestException as e: self.assertIn(str(e), 'Test exception 0') for value in pcit: iterated_value = value self.assertEqual(iterated_value, 1)
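The tuple form exercised here is the producer half of the continue_on_error behavior used by _NameExpansionIterator further down: instead of letting one bad URL abort the whole expansion, the producer yields (exception, stack_trace) and the wrapping PluralityCheckableIterator raises it when that position is reached, after which iteration can resume. A hedged sketch of such a producer (the helper name and URLs are illustrative; imports assume the gslib layout):

import sys

from gslib.exception import CommandException
from gslib.plurality_checkable_iterator import PluralityCheckableIterator


def _ExpandEach(url_strs):
  for url_str in url_strs:
    try:
      if url_str.endswith('missing'):
        raise CommandException('No URLs matched: %s' % url_str)
      yield url_str
    except CommandException as e:
      # Defer the error to the consumer instead of aborting the generator.
      yield (e, sys.exc_info()[2])


results = PluralityCheckableIterator(
    _ExpandEach(['gs://bucket/ok', 'gs://bucket/missing']))
try:
  for result in results:
    print(result)  # Prints 'gs://bucket/ok' first.
except CommandException as e:
  print('deferred error: %s' % e)  # Raised when the bad URL's turn comes.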
class _DiffIterator(object): """Iterator yielding sequence of _DiffToApply objects.""" def __init__(self, command_obj, base_src_url, base_dst_url): self.command_obj = command_obj self.compute_checksums = command_obj.compute_checksums self.delete_extras = command_obj.delete_extras self.recursion_requested = command_obj.recursion_requested self.logger = self.command_obj.logger self.base_src_url = base_src_url self.base_dst_url = base_dst_url self.logger.info('Building synchronization state...') (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp( prefix='gsutil-rsync-src-') (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp( prefix='gsutil-rsync-dst-') # Close the file handles; the file will be opened in write mode by # _ListUrlRootFunc. os.close(src_fh) os.close(dst_fh) # Build sorted lists of src and dst URLs in parallel. To do this, pass args # to _ListUrlRootFunc as tuple (url_str, out_file_name, desc). args_iter = iter([ (self.base_src_url.GetUrlString(), self.sorted_list_src_file_name, 'source'), (self.base_dst_url.GetUrlString(), self.sorted_list_dst_file_name, 'destination') ]) command_obj.Apply(_ListUrlRootFunc, args_iter, _RootListingExceptionHandler, arg_checker=DummyArgChecker, parallel_operations_override=True, fail_on_error=True) self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'rb') self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'rb') # Wrap iterators in PluralityCheckableIterator so we can check emptiness. self.sorted_src_urls_it = PluralityCheckableIterator( iter(self.sorted_list_src_file)) self.sorted_dst_urls_it = PluralityCheckableIterator( iter(self.sorted_list_dst_file)) # pylint: disable=bare-except def CleanUpTempFiles(self): """Cleans up temp files. This function allows the main (RunCommand) function to clean up at end of operation. This is necessary because tempfile.NamedTemporaryFile doesn't allow the created file to be re-opened in read mode on Windows, so we have to use tempfile.mkstemp, which doesn't automatically delete temp files (see https://mail.python.org/pipermail/python-list/2005-December/336958.html). """ try: self.sorted_list_src_file.close() self.sorted_list_dst_file.close() for fname in (self.sorted_list_src_file_name, self.sorted_list_dst_file_name): os.unlink(fname) except: pass def _ParseTmpFileLine(self, line): """Parses output from _BuildTmpOutputLine. Parses into tuple: (URL, size, crc32c, md5) where crc32c and/or md5 can be _NA. Args: line: The line to parse. Returns: Parsed tuple: (url, size, crc32c, md5) """ (encoded_url, size, crc32c, md5) = line.split() return (urllib.unquote_plus(encoded_url).decode(UTF8), int(size), crc32c, md5.strip()) def _WarnIfMissingCloudHash(self, url_str, crc32c, md5): """Warns if given url_str is a cloud URL and is missing both crc32c and md5. Args: url_str: Destination URL string. crc32c: Destination CRC32c. md5: Destination MD5. Returns: True if issued warning. """ # One known way this can currently happen is when rsync'ing objects larger # than 5GB from S3 (for which the etag is not an MD5). if (StorageUrlFromString(url_str).IsCloudUrl() and crc32c == _NA and md5 == _NA): self.logger.warn( 'Found no hashes to validate %s. ' 'Integrity cannot be assured without hashes.' % url_str) return True return False def _ObjectsMatch(self, src_url_str, src_size, src_crc32c, src_md5, dst_url_str, dst_size, dst_crc32c, dst_md5): """Returns True if src and dst objects are the same. Uses size plus whatever checksums are available. Args: src_url_str: Source URL string. 
src_size: Source size. src_crc32c: Source CRC32c. src_md5: Source MD5. dst_url_str: Destination URL string. dst_size: Destination size. dst_crc32c: Destination CRC32c. dst_md5: Destination MD5. Returns: True/False. """ # Note: This function is called from __iter__, which is called from the # Command.Apply driver. Thus, all checksum computation will be run in a # single thread, which is good (having multiple threads concurrently # computing checksums would thrash the disk). if src_size != dst_size: return False if self.compute_checksums: (src_crc32c, src_md5, dst_crc32c, dst_md5) = _ComputeNeededFileChecksums( self.logger, src_url_str, src_size, src_crc32c, src_md5, dst_url_str, dst_size, dst_crc32c, dst_md5) if src_md5 != _NA and dst_md5 != _NA: self.logger.debug('Comparing md5 for %s and %s', src_url_str, dst_url_str) return src_md5 == dst_md5 if src_crc32c != _NA and dst_crc32c != _NA: self.logger.debug( 'Comparing crc32c for %s and %s', src_url_str, dst_url_str) return src_crc32c == dst_crc32c if not self._WarnIfMissingCloudHash(src_url_str, src_crc32c, src_md5): self._WarnIfMissingCloudHash(dst_url_str, dst_crc32c, dst_md5) # Without checksums to compare we depend only on basic size comparison. return True def __iter__(self): """Iterates over src/dst URLs and produces a _DiffToApply sequence. Yields: The _DiffToApply. """ # Strip trailing slashes, if any, so we compute tail length against # consistent position regardless of whether trailing slashes were included # or not in URL. base_src_url_len = len(self.base_src_url.GetUrlString().rstrip('/\\')) base_dst_url_len = len(self.base_dst_url.GetUrlString().rstrip('/\\')) src_url_str = dst_url_str = None # Invariant: After each yield, the URLs in src_url_str, dst_url_str, # self.sorted_src_urls_it, and self.sorted_dst_urls_it are not yet # processed. Each time we encounter None in src_url_str or dst_url_str we # populate from the respective iterator, and we reset one or the other value # to None after yielding an action that disposes of that URL. while not self.sorted_src_urls_it.IsEmpty() or src_url_str is not None: if src_url_str is None: (src_url_str, src_size, src_crc32c, src_md5) = self._ParseTmpFileLine( self.sorted_src_urls_it.next()) # Skip past base URL and normalize slashes so we can compare across # clouds/file systems (including Windows). src_url_str_to_check = src_url_str[base_src_url_len:].replace('\\', '/') dst_url_str_would_copy_to = copy_helper.ConstructDstUrl( self.base_src_url, StorageUrlFromString(src_url_str), True, True, True, self.base_dst_url, False, self.recursion_requested).GetUrlString() if self.sorted_dst_urls_it.IsEmpty(): # We've reached end of dst URLs, so copy src to dst. yield _DiffToApply( src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY) src_url_str = None continue if not dst_url_str: (dst_url_str, dst_size, dst_crc32c, dst_md5) = ( self._ParseTmpFileLine(self.sorted_dst_urls_it.next())) # Skip past base URL and normalize slashes so we can compare across # clouds/file systems (including Windows). dst_url_str_to_check = dst_url_str[base_dst_url_len:].replace('\\', '/') if src_url_str_to_check < dst_url_str_to_check: # There's no dst object corresponding to src object, so copy src to dst. yield _DiffToApply( src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY) src_url_str = None elif src_url_str_to_check > dst_url_str_to_check: # dst object without a corresponding src object, so remove dst if -d # option was specified.
if self.delete_extras: yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE) dst_url_str = None else: # There is a dst object corresponding to src object, so check if objects # match. if self._ObjectsMatch( src_url_str, src_size, src_crc32c, src_md5, dst_url_str, dst_size, dst_crc32c, dst_md5): # Continue iterating without yielding a _DiffToApply. src_url_str = None dst_url_str = None else: yield _DiffToApply(src_url_str, dst_url_str, _DiffAction.COPY) dst_url_str = None # If -d option specified any files/objects left in dst iteration should be # removed. if not self.delete_extras: return if dst_url_str: yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE) dst_url_str = None for line in self.sorted_dst_urls_it: (dst_url_str, _, _, _) = self._ParseTmpFileLine(line) yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
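The loop above is a standard two-pointer merge over two sorted listings. A compact, self-contained illustration of the same decision logic on plain (name, size) tuples, where COPY/REMOVE stand in for _DiffAction and delete_extras corresponds to the -d option; this is a simplification that ignores checksums and URL re-rooting:

def Diff(src, dst, delete_extras=False):
  """Yields ('COPY', name) / ('REMOVE', name) for two sorted listings."""
  src, dst = iter(sorted(src)), iter(sorted(dst))
  s, d = next(src, None), next(dst, None)
  while s is not None:
    if d is None or s[0] < d[0]:
      yield ('COPY', s[0])         # src entry with no dst counterpart.
      s = next(src, None)
    elif s[0] > d[0]:
      if delete_extras:
        yield ('REMOVE', d[0])     # dst-only entry; removed under -d.
      d = next(dst, None)
    else:
      if s[1] != d[1]:             # Same name; size stands in for the
        yield ('COPY', s[0])       # size-plus-checksum comparison above.
      s, d = next(src, None), next(dst, None)
  if delete_extras:
    while d is not None:
      yield ('REMOVE', d[0])
      d = next(dst, None)


# list(Diff([('a', 1), ('b', 2)], [('b', 3), ('c', 1)], delete_extras=True))
# == [('COPY', 'a'), ('COPY', 'b'), ('REMOVE', 'c')]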
def NameExpansionIterator(command_name, proj_id_handler, headers, debug, bucket_storage_uri_class, uri_strs, recursion_requested, have_existing_dst_container=None, flat=True, all_versions=False, for_all_version_delete=False): """ Static factory function for instantiating _NameExpansionIterator. This wraps the resulting iterator in a PluralityCheckableIterator and checks that it is non-empty. Also, allows uri_strs to be either an array or an iterator. Args: command_name: name of command being run. proj_id_handler: ProjectIdHandler to use for current command. headers: Dictionary containing optional HTTP headers to pass to boto. debug: Debug level to pass in to boto connection (range 0..3). bucket_storage_uri_class: Class to instantiate for cloud StorageUris. Settable for testing/mocking. uri_strs: PluralityCheckableIterator of URI strings needing expansion. recursion_requested: True if -R specified on command-line. have_existing_dst_container: Bool indicator whether this is a copy request to an existing bucket, bucket subdir, or directory. Default None value should be used in cases where this is not needed (commands other than cp). flat: Bool indicating whether bucket listings should be flattened, i.e., so the mapped-to results contain objects spanning subdirectories. all_versions: Bool indicating whether to iterate over all object versions. for_all_version_delete: Bool indicating whether this is for an all-version delete. Examples of ExpandWildcardsAndContainers with flat=True: - Calling with one of the uri_strs being 'gs://bucket' will enumerate all top-level objects, as will 'gs://bucket/' and 'gs://bucket/*'. - 'gs://bucket/**' will enumerate all objects in the bucket. - 'gs://bucket/abc' will enumerate all next-level objects under directory abc (i.e., not including subdirectories of abc) if gs://bucket/abc/* matches any objects; otherwise it will enumerate the single name gs://bucket/abc - 'gs://bucket/abc/**' will enumerate all objects under abc or any of its subdirectories. - 'file:///tmp' will enumerate all files under /tmp, as will 'file:///tmp/*' - 'file:///tmp/**' will enumerate all files under /tmp or any of its subdirectories. Example if flat=False: calling with gs://bucket/abc/* lists matching objects or subdirs, but not sub-subdirs or objects beneath subdirs. Note: In step-by-step comments below we give examples assuming there's a gs://bucket with object paths: abcd/o1.txt abcd/o2.txt xyz/o1.txt xyz/o2.txt and a directory file://dir with file paths: dir/a.txt dir/b.txt dir/c/ """ uri_strs = PluralityCheckableIterator(uri_strs) name_expansion_iterator = _NameExpansionIterator( command_name, proj_id_handler, headers, debug, bucket_storage_uri_class, uri_strs, recursion_requested, have_existing_dst_container, flat, all_versions=all_versions, for_all_version_delete=for_all_version_delete) name_expansion_iterator = PluralityCheckableIterator( name_expansion_iterator) if name_expansion_iterator.is_empty(): raise CommandException('No URIs matched') return name_expansion_iterator
def _RecursePrint(self, blr): """ Expands a bucket listing reference and recurses to its children, calling _PrintInfoAboutBucketListingRef for each expanded object found. Args: blr: An instance of BucketListingRef. Returns: Tuple containing (number of objects, total number of bytes) """ num_bytes = 0 num_objs = 0 if blr.HasKey(): blr_iterator = iter([blr]) elif blr.HasPrefix(): blr_iterator = self.WildcardIterator( '%s/*' % blr.GetRStrippedUriString(), all_versions=self.all_versions) elif blr.NamesBucket(): blr_iterator = self.WildcardIterator( '%s*' % blr.GetUriString(), all_versions=self.all_versions) else: # This BLR didn't come from a bucket listing. This case happens for # BLR's instantiated from a user-provided URI. blr_iterator = PluralityCheckableIterator( UriOnlyBlrExpansionIterator(self, blr, all_versions=self.all_versions)) if blr_iterator.is_empty() and not ContainsWildcard( blr.GetUriString()): raise CommandException('No such object %s' % blr.GetUriString()) for cur_blr in blr_iterator: if self.exclude_patterns: tomatch = cur_blr.GetUriString() skip = False for pattern in self.exclude_patterns: if fnmatch.fnmatch(tomatch, pattern): skip = True break if skip: continue if cur_blr.HasKey(): # Object listing. no, nb = self._PrintInfoAboutBucketListingRef(cur_blr) else: # Subdir listing. if cur_blr.GetUriString().endswith('//'): # Expand gs://bucket// into gs://bucket//* so we don't infinite # loop. This case happens when user has uploaded an object whose # name begins with a /. cur_blr = BucketListingRef( self.suri_builder.StorageUri( '%s*' % cur_blr.GetUriString()), None, None, cur_blr.headers) no, nb = self._RecursePrint(cur_blr) num_bytes += nb num_objs += no if blr.HasPrefix() and not self.summary_only: self._PrintSummaryLine(num_bytes, blr.GetUriString().encode('utf-8')) return num_objs, num_bytes
def _ExpandUriAndPrintInfo(self, uri, listing_style, should_recurse=False): """ Expands wildcards and directories/buckets for uri as needed, and calls _PrintInfoAboutBucketListingRef() on each. Args: uri: StorageUri being listed. listing_style: ListingStyle enum describing type of output desired. should_recurse: bool indicator of whether to expand recursively. Returns: Tuple (number of matching objects, number of bytes across these objects). """ # We do a two-level loop, with the outer loop iterating level-by-level from # blrs_to_expand, and the inner loop iterating the matches at the current # level, printing them, and adding any new subdirs that need expanding to # blrs_to_expand (to be picked up in the next outer loop iteration). blrs_to_expand = [BucketListingRef(uri)] num_objs = 0 num_bytes = 0 expanding_top_level = True printed_one = False num_expanded_blrs = 0 while len(blrs_to_expand): if printed_one: print blr = blrs_to_expand.pop(0) if blr.HasKey(): blr_iterator = iter([blr]) elif blr.HasPrefix(): # Bucket subdir from a previous iteration. Print "header" line only if # we're listing more than one subdir (or if it's a recursive listing), # to be consistent with the way UNIX ls works. if num_expanded_blrs > 1 or should_recurse: print '%s:' % blr.GetUriString().encode('utf-8') printed_one = True blr_iterator = self.WildcardIterator( '%s/*' % blr.GetRStrippedUriString()) elif blr.NamesBucket(): blr_iterator = self.WildcardIterator('%s*' % blr.GetUriString()) else: # This BLR didn't come from a bucket listing. This case happens for # BLR's instantiated from a user-provided URI. blr_iterator = PluralityCheckableIterator( _UriOnlyBlrExpansionIterator(self, blr)) if blr_iterator.is_empty() and not ContainsWildcard(uri): raise CommandException('No such object %s' % uri) for cur_blr in blr_iterator: num_expanded_blrs = num_expanded_blrs + 1 if cur_blr.HasKey(): # Object listing. (no, nb) = self._PrintInfoAboutBucketListingRef( cur_blr, listing_style) num_objs += no num_bytes += nb printed_one = True else: # Subdir listing. If we're at the top level of a bucket subdir # listing don't print the list here (corresponding to how UNIX ls # dir just prints its contents, not the name followed by its # contents). if (expanding_top_level and not uri.names_bucket()) or should_recurse: if cur_blr.GetUriString().endswith('//'): # Expand gs://bucket// into gs://bucket//* so we don't infinite # loop. This case happens when user has uploaded an object whose # name begins with a /. cur_blr = BucketListingRef( self.suri_builder.StorageUri( '%s*' % cur_blr.GetUriString()), None, None, cur_blr.headers) blrs_to_expand.append(cur_blr) # Don't include the subdir name in the output if we're doing a # recursive listing, as it will be printed as 'subdir:' when we get # to the prefix expansion, the next iteration of the main loop. else: if listing_style == ListingStyle.LONG: print '%-33s%s' % ( '', cur_blr.GetUriString().encode('utf-8')) else: print cur_blr.GetUriString().encode('utf-8') expanding_top_level = False return (num_objs, num_bytes)
def ExpandUrlAndPrint(self, url): """Iterates over the given URL and calls print functions. Args: url: StorageUrl to iterate over. Returns: (num_objects, num_bytes) total number of objects and bytes iterated. """ num_objects = 0 num_dirs = 0 num_bytes = 0 print_newline = False if url.IsBucket() or self.should_recurse: # IsBucket() implies a top-level listing. if url.IsBucket(): self._print_bucket_header_func(url) return self._RecurseExpandUrlAndPrint(url.url_string, print_initial_newline=False) else: # User provided a prefix or object URL, but it's impossible to tell # which until we do a listing and see what matches. top_level_iterator = PluralityCheckableIterator( self._iterator_func( url.CreatePrefixUrl(wildcard_suffix=None), all_versions=self.all_versions).IterAll( expand_top_level_buckets=True, bucket_listing_fields=self.bucket_listing_fields)) plurality = top_level_iterator.HasPlurality() try: top_level_iterator.PeekException() except EncryptionException: # Detailed listing on a single object can perform a GetObjectMetadata # call, which raises if a matching encryption key isn't found. # Re-iterate without requesting encrypted fields. top_level_iterator = PluralityCheckableIterator( self._iterator_func( url.CreatePrefixUrl(wildcard_suffix=None), all_versions=self.all_versions). IterAll( expand_top_level_buckets=True, bucket_listing_fields=UNENCRYPTED_FULL_LISTING_FIELDS)) plurality = top_level_iterator.HasPlurality() for blr in top_level_iterator: if self._MatchesExcludedPattern(blr): continue if blr.IsObject(): nd = 0 no, nb = self._print_object_func(blr) print_newline = True elif blr.IsPrefix(): if print_newline: self._print_newline_func() else: print_newline = True if plurality and self.list_subdir_contents: self._print_dir_header_func(blr) elif plurality and not self.list_subdir_contents: print_newline = False expansion_url_str = StorageUrlFromString( blr.url_string).CreatePrefixUrl( wildcard_suffix='*' if self. list_subdir_contents else None) nd, no, nb = self._RecurseExpandUrlAndPrint( expansion_url_str) self._print_dir_summary_func(nb, blr) else: # We handle all buckets at the top level, so this should never happen. raise CommandException( 'Sub-level iterator returned a CsBucketListingRef of type Bucket' ) num_objects += no num_dirs += nd num_bytes += nb return num_dirs, num_objects, num_bytes
class _DiffIterator(object): """Iterator yielding sequence of _DiffToApply objects.""" def __init__(self, command_obj, base_src_url, base_dst_url): self.command_obj = command_obj self.compute_file_checksums = command_obj.compute_file_checksums self.delete_extras = command_obj.delete_extras self.recursion_requested = command_obj.recursion_requested self.logger = self.command_obj.logger self.base_src_url = base_src_url self.base_dst_url = base_dst_url self.logger.info("Building synchronization state...") (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp(prefix="gsutil-rsync-src-") _tmp_files.append(self.sorted_list_src_file_name) (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp(prefix="gsutil-rsync-dst-") _tmp_files.append(self.sorted_list_dst_file_name) # Close the file handles; the file will be opened in write mode by # _ListUrlRootFunc. os.close(src_fh) os.close(dst_fh) # Build sorted lists of src and dst URLs in parallel. To do this, pass args # to _ListUrlRootFunc as tuple (base_url_str, out_filename, desc) # where base_url_str is the starting URL string for listing. args_iter = iter( [ (self.base_src_url.url_string, self.sorted_list_src_file_name, "source"), (self.base_dst_url.url_string, self.sorted_list_dst_file_name, "destination"), ] ) # Contains error message from non-retryable listing failure. command_obj.non_retryable_listing_failures = 0 shared_attrs = ["non_retryable_listing_failures"] command_obj.Apply( _ListUrlRootFunc, args_iter, _RootListingExceptionHandler, shared_attrs, arg_checker=DummyArgChecker, parallel_operations_override=True, fail_on_error=True, ) if command_obj.non_retryable_listing_failures: raise CommandException("Caught non-retryable exception - aborting rsync") self.sorted_list_src_file = open(self.sorted_list_src_file_name, "r") self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, "r") # Wrap iterators in PluralityCheckableIterator so we can check emptiness. self.sorted_src_urls_it = PluralityCheckableIterator(iter(self.sorted_list_src_file)) self.sorted_dst_urls_it = PluralityCheckableIterator(iter(self.sorted_list_dst_file)) def _ParseTmpFileLine(self, line): """Parses output from _BuildTmpOutputLine. Parses into tuple: (URL, size, crc32c, md5) where crc32c and/or md5 can be _NA. Args: line: The line to parse. Returns: Parsed tuple: (url, size, crc32c, md5) """ (encoded_url, size, crc32c, md5) = line.split() return (_DecodeUrl(encoded_url), int(size), crc32c, md5.strip()) def _WarnIfMissingCloudHash(self, url_str, crc32c, md5): """Warns if given url_str is a cloud URL and is missing both crc32c and md5. Args: url_str: Destination URL string. crc32c: Destination CRC32c. md5: Destination MD5. Returns: True if issued warning. """ # One known way this can currently happen is when rsync'ing objects larger # than 5 GB from S3 (for which the etag is not an MD5). if StorageUrlFromString(url_str).IsCloudUrl() and crc32c == _NA and md5 == _NA: self.logger.warn("Found no hashes to validate %s. Integrity cannot be assured without " "hashes.", url_str) return True return False def _ObjectsMatch(self, src_url_str, src_size, src_crc32c, src_md5, dst_url_str, dst_size, dst_crc32c, dst_md5): """Returns True if src and dst objects are the same. Uses size plus whatever checksums are available. Args: src_url_str: Source URL string. src_size: Source size src_crc32c: Source CRC32c. src_md5: Source MD5. dst_url_str: Destination URL string. dst_size: Destination size dst_crc32c: Destination CRC32c. dst_md5: Destination MD5. Returns: True/False. 
""" # Note: This function is called from __iter__, which is called from the # Command.Apply driver. Thus, all checksum computation will be run in a # single thread, which is good (having multiple threads concurrently # computing checksums would thrash the disk). if src_size != dst_size: return False if self.compute_file_checksums: (src_crc32c, src_md5, dst_crc32c, dst_md5) = _ComputeNeededFileChecksums( self.logger, src_url_str, src_size, src_crc32c, src_md5, dst_url_str, dst_size, dst_crc32c, dst_md5 ) if src_md5 != _NA and dst_md5 != _NA: self.logger.debug("Comparing md5 for %s and %s", src_url_str, dst_url_str) return src_md5 == dst_md5 if src_crc32c != _NA and dst_crc32c != _NA: self.logger.debug("Comparing crc32c for %s and %s", src_url_str, dst_url_str) return src_crc32c == dst_crc32c if not self._WarnIfMissingCloudHash(src_url_str, src_crc32c, src_md5): self._WarnIfMissingCloudHash(dst_url_str, dst_crc32c, dst_md5) # Without checksums to compare we depend only on basic size comparison. return True def __iter__(self): """Iterates over src/dst URLs and produces a _DiffToApply sequence. Yields: The _DiffToApply. """ # Strip trailing slashes, if any, so we compute tail length against # consistent position regardless of whether trailing slashes were included # or not in URL. base_src_url_len = len(self.base_src_url.url_string.rstrip("/\\")) base_dst_url_len = len(self.base_dst_url.url_string.rstrip("/\\")) src_url_str = dst_url_str = None # Invariant: After each yield, the URLs in src_url_str, dst_url_str, # self.sorted_src_urls_it, and self.sorted_dst_urls_it are not yet # processed. Each time we encounter None in src_url_str or dst_url_str we # populate from the respective iterator, and we reset one or the other value # to None after yielding an action that disposes of that URL. while not self.sorted_src_urls_it.IsEmpty() or src_url_str is not None: if src_url_str is None: (src_url_str, src_size, src_crc32c, src_md5) = self._ParseTmpFileLine(self.sorted_src_urls_it.next()) # Skip past base URL and normalize slashes so we can compare across # clouds/file systems (including Windows). src_url_str_to_check = _EncodeUrl(src_url_str[base_src_url_len:].replace("\\", "/")) dst_url_str_would_copy_to = copy_helper.ConstructDstUrl( self.base_src_url, StorageUrlFromString(src_url_str), True, True, self.base_dst_url, False, self.recursion_requested, ).url_string if self.sorted_dst_urls_it.IsEmpty(): # We've reached end of dst URLs, so copy src to dst. yield _DiffToApply(src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY) src_url_str = None continue if not dst_url_str: (dst_url_str, dst_size, dst_crc32c, dst_md5) = self._ParseTmpFileLine(self.sorted_dst_urls_it.next()) # Skip past base URL and normalize slashes so we can compare acros # clouds/file systems (including Windows). dst_url_str_to_check = _EncodeUrl(dst_url_str[base_dst_url_len:].replace("\\", "/")) if src_url_str_to_check < dst_url_str_to_check: # There's no dst object corresponding to src object, so copy src to dst. yield _DiffToApply(src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY) src_url_str = None elif src_url_str_to_check > dst_url_str_to_check: # dst object without a corresponding src object, so remove dst if -d # option was specified. if self.delete_extras: yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE) dst_url_str = None else: # There is a dst object corresponding to src object, so check if objects # match. 
if self._ObjectsMatch( src_url_str, src_size, src_crc32c, src_md5, dst_url_str, dst_size, dst_crc32c, dst_md5 ): # Continue iterating without yielding a _DiffToApply. pass else: yield _DiffToApply(src_url_str, dst_url_str, _DiffAction.COPY) src_url_str = None dst_url_str = None # If -d option specified any files/objects left in dst iteration should be # removed. if not self.delete_extras: return if dst_url_str: yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE) dst_url_str = None for line in self.sorted_dst_urls_it: (dst_url_str, _, _, _) = self._ParseTmpFileLine(line) yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
def _ExpandUriAndPrintInfo(self, uri, listing_style, should_recurse=False): """ Expands wildcards and directories/buckets for uri as needed, and calls _PrintInfoAboutBucketListingRef() on each. Args: uri: StorageUri being listed. listing_style: ListingStyle enum describing type of output desired. should_recurse: bool indicator of whether to expand recursively. Returns: Tuple (number of matching objects, number of bytes across these objects). """ # We do a two-level loop, with the outer loop iterating level-by-level from # blrs_to_expand, and the inner loop iterating the matches at the current # level, printing them, and adding any new subdirs that need expanding to # blrs_to_expand (to be picked up in the next outer loop iteration). blrs_to_expand = [BucketListingRef(uri)] num_objs = 0 num_bytes = 0 expanding_top_level = True printed_one = False num_expanded_blrs = 0 while len(blrs_to_expand): if printed_one: print blr = blrs_to_expand.pop(0) if blr.HasKey(): blr_iterator = iter([blr]) elif blr.HasPrefix(): # Bucket subdir from a previous iteration. Print "header" line only if # we're listing more than one subdir (or if it's a recursive listing), # to be consistent with the way UNIX ls works. if num_expanded_blrs > 1 or should_recurse: print '%s:' % blr.GetUriString().encode('utf-8') printed_one = True blr_iterator = self.WildcardIterator('%s/*' % blr.GetRStrippedUriString(), all_versions=self.all_versions) elif blr.NamesBucket(): blr_iterator = self.WildcardIterator('%s*' % blr.GetUriString(), all_versions=self.all_versions) else: # This BLR didn't come from a bucket listing. This case happens for # BLR's instantiated from a user-provided URI. blr_iterator = PluralityCheckableIterator( _UriOnlyBlrExpansionIterator( self, blr, all_versions=self.all_versions)) if blr_iterator.is_empty() and not ContainsWildcard(uri): raise CommandException('No such object %s' % uri) for cur_blr in blr_iterator: num_expanded_blrs = num_expanded_blrs + 1 if cur_blr.HasKey(): # Object listing. (no, nb) = self._PrintInfoAboutBucketListingRef( cur_blr, listing_style) num_objs += no num_bytes += nb printed_one = True else: # Subdir listing. If we're at the top level of a bucket subdir # listing don't print the list here (corresponding to how UNIX ls # dir just prints its contents, not the name followed by its # contents). if (expanding_top_level and not uri.names_bucket()) or should_recurse: if cur_blr.GetUriString().endswith('//'): # Expand gs://bucket// into gs://bucket//* so we don't infinite # loop. This case happens when user has uploaded an object whose # name begins with a /. cur_blr = BucketListingRef(self.suri_builder.StorageUri( '%s*' % cur_blr.GetUriString()), None, None, cur_blr.headers) blrs_to_expand.append(cur_blr) # Don't include the subdir name in the output if we're doing a # recursive listing, as it will be printed as 'subdir:' when we get # to the prefix expansion, the next iteration of the main loop. else: if listing_style == ListingStyle.LONG: print '%-33s%s' % ( '', cur_blr.GetUriString().encode('utf-8')) else: print cur_blr.GetUriString().encode('utf-8') expanding_top_level = False return (num_objs, num_bytes)
def __iter__(self): """Iterates over all source URLs passed to the iterator. For each src url, expands wildcards, object-less bucket names, subdir bucket names, and directory names, and generates a flat listing of all the matching objects/files. You should instantiate this object using the static factory function NameExpansionIterator, because consumers of this iterator need the PluralityCheckableIterator wrapper built by that function. Yields: gslib.name_expansion.NameExpansionResult. Raises: CommandException: if errors encountered. """ for url_str in self.url_strs: storage_url = StorageUrlFromString(url_str) if storage_url.IsFileUrl() and storage_url.IsStream(): if self.url_strs.has_plurality: raise CommandException( 'Multiple URL strings are not supported ' 'with streaming ("-") URLs.') yield NameExpansionResult(storage_url, False, False, storage_url) continue # Step 1: Expand any explicitly specified wildcards. The output from this # step is an iterator of BucketListingRef. # Starting with gs://buck*/abc* this step would expand to gs://bucket/abcd src_names_bucket = False if (storage_url.IsCloudUrl() and storage_url.IsBucket() and not self.recursion_requested): # UNIX commands like rm and cp will omit directory references. # If url_str refers only to buckets and we are not recursing, # then produce references of type BUCKET, because they are guaranteed # to pass through Step 2 and be omitted in Step 3. post_step1_iter = PluralityCheckableIterator( self.WildcardIterator(url_str).IterBuckets( bucket_fields=['id'])) else: # Get a list of objects and prefixes, expanding the top level for # any listed buckets. If our source is a bucket, however, we need # to treat all of the top level expansions as names_container=True. post_step1_iter = PluralityCheckableIterator( self.WildcardIterator(url_str).IterAll( bucket_listing_fields=['name'], expand_top_level_buckets=True)) if storage_url.IsCloudUrl() and storage_url.IsBucket(): src_names_bucket = True # Step 2: Expand bucket subdirs. The output from this # step is an iterator of (names_container, BucketListingRef). # Starting with gs://bucket/abcd this step would expand to: # iter([(True, abcd/o1.txt), (True, abcd/o2.txt)]). subdir_exp_wildcard = self._flatness_wildcard[ self.recursion_requested] if self.recursion_requested: post_step2_iter = _ImplicitBucketSubdirIterator( self, post_step1_iter, subdir_exp_wildcard) else: post_step2_iter = _NonContainerTuplifyIterator(post_step1_iter) post_step2_iter = PluralityCheckableIterator(post_step2_iter) # Because we actually perform and check object listings here, this will # raise if url_args includes a non-existent object. However, # plurality_checkable_iterator will buffer the exception for us, not # raising it until the iterator is actually asked to yield the first # result. if post_step2_iter.IsEmpty(): if self.continue_on_error: try: raise CommandException('No URLs matched: %s' % url_str) except CommandException, e: # Yield a specialized tuple of (exception, stack_trace) to # the wrapping PluralityCheckableIterator. yield (e, sys.exc_info()[2]) else: raise CommandException('No URLs matched: %s' % url_str) # Step 3. Omit any directories, buckets, or bucket subdirectories for # non-recursive expansions. 
      post_step3_iter = PluralityCheckableIterator(
          _OmitNonRecursiveIterator(post_step2_iter, self.recursion_requested,
                                    self.command_name,
                                    self.cmd_supports_recursion, self.logger))
      src_url_expands_to_multi = post_step3_iter.HasPlurality()
      is_multi_source_request = (self.url_strs.has_plurality
                                 or src_url_expands_to_multi)
      # Step 4. Expand directories and buckets. This step yields the iterated
      # values. Starting with gs://bucket this step would expand to:
      # [abcd/o1.txt, abcd/o2.txt, xyz/o1.txt, xyz/o2.txt]
      # Starting with file://dir this step would expand to:
      # [dir/a.txt, dir/b.txt, dir/c/]
      for (names_container, blr) in post_step3_iter:
        src_names_container = src_names_bucket or names_container
        if blr.IsObject():
          yield NameExpansionResult(storage_url, is_multi_source_request,
                                    src_names_container, blr.storage_url)
        else:
          # Use implicit wildcarding to do the enumeration.
          # At this point we are guaranteed that:
          # - Recursion has been requested, because non-object entries would
          #   otherwise have been filtered out in step 3.
          # - This is a prefix or bucket subdirectory, because only
          #   non-recursive iterations produce bucket references.
          expanded_url = StorageUrlFromString(blr.url_string)
          if expanded_url.IsFileUrl():
            # Convert dir to implicit recursive wildcard.
            url_to_iterate = '%s%s%s' % (blr, os.sep, subdir_exp_wildcard)
          else:
            # Convert subdir to implicit recursive wildcard.
            url_to_iterate = expanded_url.CreatePrefixUrl(
                wildcard_suffix=subdir_exp_wildcard)
          wc_iter = PluralityCheckableIterator(
              self.WildcardIterator(url_to_iterate).IterObjects(
                  bucket_listing_fields=['name']))
          src_url_expands_to_multi = (src_url_expands_to_multi
                                      or wc_iter.HasPlurality())
          is_multi_source_request = (self.url_strs.has_plurality
                                     or src_url_expands_to_multi)
          # This will be a flattened listing of all underlying objects in the
          # subdir.
          for blr in wc_iter:
            yield NameExpansionResult(storage_url, is_multi_source_request,
                                      True, blr.storage_url)
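# The IsEmpty()/HasPlurality() calls used throughout this expansion work
# because the wrapper reads at most two results ahead of the consumer and
# buffers them, including any exception raised while looking ahead. This is a
# condensed sketch of that contract, not the real gslib class.
class SimplePluralityCheckableIterator(object):
  """Buffers up to two look-ahead results so emptiness/plurality is cheap."""

  def __init__(self, it):
    self.it = iter(it)
    self.buf = []  # Ordered (is_exception, value) pairs read ahead of consumer.

  def _Fill(self, count):
    while len(self.buf) < count:
      try:
        self.buf.append((False, next(self.it)))
      except StopIteration:
        break
      except Exception as e:  # Defer the failure until this slot is consumed.
        self.buf.append((True, e))

  def IsEmpty(self):
    self._Fill(1)
    return not self.buf

  def HasPlurality(self):
    self._Fill(2)
    return len(self.buf) > 1

  def __iter__(self):
    return self

  def __next__(self):
    self._Fill(1)
    if not self.buf:
      raise StopIteration()
    is_exception, value = self.buf.pop(0)
    if is_exception:
      raise value
    return value

  next = __next__  # So it also answers the .next() calls used in this code.

# Example: it = SimplePluralityCheckableIterator(iter([1]))
# it.IsEmpty() -> False, it.HasPlurality() -> False, list(it) -> [1].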
def __iter__(self): for uri_str in self.uri_strs: # Step 1: Expand any explicitly specified wildcards. The output from this # step is an iterator of BucketListingRef. # Starting with gs://buck*/abc* this step would expand to gs://bucket/abcd if ContainsWildcard(uri_str): post_step1_iter = self._WildcardIterator(uri_str) else: suri = self.suri_builder.StorageUri(uri_str) post_step1_iter = iter([BucketListingRef(suri)]) post_step1_iter = PluralityCheckableIterator(post_step1_iter) # Step 2: Expand bucket subdirs and versions. The output from this # step is an iterator of (names_container, BucketListingRef). # Starting with gs://bucket/abcd this step would expand to: # iter([(True, abcd/o1.txt), (True, abcd/o2.txt)]). if self.flat and self.recursion_requested: post_step2_iter = _ImplicitBucketSubdirIterator(self, post_step1_iter, self.flat) elif self.all_versions: post_step2_iter = _AllVersionIterator(self, post_step1_iter, headers=self.headers) else: post_step2_iter = _NonContainerTuplifyIterator(post_step1_iter) post_step2_iter = PluralityCheckableIterator(post_step2_iter) # Step 3. Expand directories and buckets. This step yields the iterated # values. Starting with gs://bucket this step would expand to: # [abcd/o1.txt, abcd/o2.txt, xyz/o1.txt, xyz/o2.txt] # Starting with file://dir this step would expand to: # [dir/a.txt, dir/b.txt, dir/c/] exp_src_bucket_listing_refs = [] wc = self._flatness_wildcard[self.flat] src_uri_expands_to_multi = (post_step1_iter.has_plurality() or post_step2_iter.has_plurality()) is_multi_src_request = (self.uri_strs.has_plurality or src_uri_expands_to_multi) if post_step2_iter.is_empty(): raise CommandException('No URIs matched: %s' % uri_str) for (names_container, blr) in post_step2_iter: if (not blr.GetUri().names_container() and (self.flat or not blr.HasPrefix())): yield NameExpansionResult(uri_str, is_multi_src_request, src_uri_expands_to_multi, names_container, blr.GetUriString(), self.have_existing_dst_container, is_latest=blr.IsLatest()) continue if not self.recursion_requested: if blr.GetUri().is_file_uri(): desc = 'directory' elif blr.GetUri().names_bucket(): desc = 'bucket' else: desc = 'bucket subdir' if self.cmd_supports_recursion: self.logger.info( 'Omitting %s "%s". (Did you mean to do %s -R?)', desc, blr.GetUri(), self.command_name) else: self.logger.info('Omitting %s "%s".', desc, blr.GetUri()) continue if blr.GetUri().is_file_uri(): # Convert dir to implicit recursive wildcard. uri_to_iterate = '%s/%s' % (blr.GetUriString(), wc) else: # Convert bucket to implicit recursive wildcard. uri_to_iterate = blr.GetUri().clone_replace_name(wc) wc_iter = PluralityCheckableIterator( self._WildcardIterator(uri_to_iterate)) src_uri_expands_to_multi = (src_uri_expands_to_multi or wc_iter.has_plurality()) is_multi_src_request = (self.uri_strs.has_plurality or src_uri_expands_to_multi) for blr in wc_iter: yield NameExpansionResult(uri_str, is_multi_src_request, src_uri_expands_to_multi, True, blr.GetUriString(), self.have_existing_dst_container, is_latest=blr.IsLatest())
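# _NonContainerTuplifyIterator above only has to adapt the step-1 iterator to
# the (names_container, BucketListingRef) shape that the later steps consume.
# A plausible minimal form, assuming it simply tags every reference as a
# non-container, is a one-loop generator like this:
def NonContainerTuplify(blr_iter):
  """Yields (False, blr) for each reference in blr_iter."""
  for blr in blr_iter:
    yield (False, blr)

# Usage: post_step2 = NonContainerTuplify(post_step1)  # each item -> (False, blr)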
def __iter__(self): for uri_str in self.uri_strs: # Step 1: Expand any explicitly specified wildcards. The output from this # step is an iterator of BucketListingRef. # Starting with gs://buck*/abc* this step would expand to gs://bucket/abcd if ContainsWildcard(uri_str): post_step1_iter = self._WildcardIterator(uri_str) else: suri = self.suri_builder.StorageUri(uri_str) post_step1_iter = iter([BucketListingRef(suri)]) post_step1_iter = PluralityCheckableIterator(post_step1_iter) # Step 2: Expand bucket subdirs and versions. The output from this # step is an iterator of (names_container, BucketListingRef). # Starting with gs://bucket/abcd this step would expand to: # iter([(True, abcd/o1.txt), (True, abcd/o2.txt)]). if self.flat and self.recursion_requested: post_step2_iter = _ImplicitBucketSubdirIterator( self, post_step1_iter, self.flat) elif self.all_versions: post_step2_iter = _AllVersionIterator(self, post_step1_iter, headers=self.headers) else: post_step2_iter = _NonContainerTuplifyIterator(post_step1_iter) post_step2_iter = PluralityCheckableIterator(post_step2_iter) # Step 3. Expand directories and buckets. This step yields the iterated # values. Starting with gs://bucket this step would expand to: # [abcd/o1.txt, abcd/o2.txt, xyz/o1.txt, xyz/o2.txt] # Starting with file://dir this step would expand to: # [dir/a.txt, dir/b.txt, dir/c/] exp_src_bucket_listing_refs = [] wc = self._flatness_wildcard[self.flat] src_uri_expands_to_multi = (post_step1_iter.has_plurality() or post_step2_iter.has_plurality()) is_multi_src_request = (self.uri_strs.has_plurality() or src_uri_expands_to_multi) if post_step2_iter.is_empty(): raise CommandException('No URIs matched: %s' % uri_str) for (names_container, blr) in post_step2_iter: if (not blr.GetUri().names_container() and (self.flat or not blr.HasPrefix())): yield NameExpansionResult(uri_str, is_multi_src_request, src_uri_expands_to_multi, names_container, blr.GetUriString(), self.have_existing_dst_container, is_latest=blr.IsLatest()) continue if not self.recursion_requested: if blr.GetUri().is_file_uri(): desc = 'directory' else: desc = 'bucket' print 'Omitting %s "%s". (Did you mean to do %s -R?)' % ( desc, blr.GetUri(), self.command_name) continue if blr.GetUri().is_file_uri(): # Convert dir to implicit recursive wildcard. uri_to_iterate = '%s/%s' % (blr.GetUriString(), wc) else: # Convert bucket to implicit recursive wildcard. uri_to_iterate = blr.GetUri().clone_replace_name(wc) wc_iter = PluralityCheckableIterator( self._WildcardIterator(uri_to_iterate)) src_uri_expands_to_multi = (src_uri_expands_to_multi or wc_iter.has_plurality()) is_multi_src_request = (self.uri_strs.has_plurality() or src_uri_expands_to_multi) for blr in wc_iter: yield NameExpansionResult(uri_str, is_multi_src_request, src_uri_expands_to_multi, True, blr.GetUriString(), self.have_existing_dst_container, is_latest=blr.IsLatest())
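# Both versions of __iter__ pick their expansion wildcard from
# self._flatness_wildcard, and _ImplicitBucketSubdirIterator then re-lists
# container references with that wildcard appended. The sketch below captures
# that idea with plain strings; the assumed flat -> '**' mapping, the '/'
# suffix test, and the list_matches() helper are stand-ins for the real
# BucketListingRef/WildcardIterator machinery.
FLATNESS_WILDCARD = {True: '**', False: '*'}  # Assumed mapping for the sketch.

def ImplicitSubdirExpand(names, flat, list_matches):
  """Re-lists container names with a wildcard; passes objects through."""
  wildcard = FLATNESS_WILDCARD[flat]
  for name in names:
    if name.endswith('/'):
      for expanded in list_matches(name + wildcard):
        yield (True, expanded)
    else:
      yield (False, name)

# list(ImplicitSubdirExpand(['gs://b/abcd/', 'gs://b/top.txt'], True,
#                           lambda pat: ['gs://b/abcd/o1.txt']))
# -> [(True, 'gs://b/abcd/o1.txt'), (False, 'gs://b/top.txt')]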
def NameExpansionIterator(command_name, proj_id_handler, headers, debug,
                          logger, bucket_storage_uri_class, uri_strs,
                          recursion_requested,
                          have_existing_dst_container=None, flat=True,
                          all_versions=False, for_all_version_delete=False,
                          cmd_supports_recursion=True):
  """
  Static factory function for instantiating _NameExpansionIterator, which
  wraps the resulting iterator in a PluralityCheckableIterator and checks
  that it is non-empty. Also allows uri_strs to be either an array or an
  iterator.

  Args:
    command_name: name of command being run.
    proj_id_handler: ProjectIdHandler to use for current command.
    headers: Dictionary containing optional HTTP headers to pass to boto.
    debug: Debug level to pass in to boto connection (range 0..3).
    logger: logging.Logger object.
    bucket_storage_uri_class: Class to instantiate for cloud StorageUris.
        Settable for testing/mocking.
    uri_strs: PluralityCheckableIterator of URI strings needing expansion.
    recursion_requested: True if -R specified on command-line.
    have_existing_dst_container: Bool indicator whether this is a copy
        request to an existing bucket, bucket subdir, or directory. Default
        None value should be used in cases where this is not needed (commands
        other than cp).
    flat: Bool indicating whether bucket listings should be flattened, i.e.,
        so the mapped-to results contain objects spanning subdirectories.
    all_versions: Bool indicating whether to iterate over all object versions.
    for_all_version_delete: Bool indicating whether this is for an all-version
        delete.
    cmd_supports_recursion: Bool indicating whether this command supports a
        '-R' flag. Useful for printing helpful error messages.

  Examples of ExpandWildcardsAndContainers with flat=True:
    - Calling with one of the uri_strs being 'gs://bucket' will enumerate all
      top-level objects, as will 'gs://bucket/' and 'gs://bucket/*'.
    - 'gs://bucket/**' will enumerate all objects in the bucket.
    - 'gs://bucket/abc' will enumerate all next-level objects under directory
      abc (i.e., not including subdirectories of abc) if gs://bucket/abc/*
      matches any objects; otherwise it will enumerate the single name
      gs://bucket/abc.
    - 'gs://bucket/abc/**' will enumerate all objects under abc or any of its
      subdirectories.
    - 'file:///tmp' will enumerate all files under /tmp, as will
      'file:///tmp/*'.
    - 'file:///tmp/**' will enumerate all files under /tmp or any of its
      subdirectories.

  Example if flat=False: calling with gs://bucket/abc/* lists matching
  objects or subdirs, but not sub-subdirs or objects beneath subdirs.

  Note: In step-by-step comments below we give examples assuming there's a
  gs://bucket with object paths:
    abcd/o1.txt
    abcd/o2.txt
    xyz/o1.txt
    xyz/o2.txt
  and a directory file://dir with file paths:
    dir/a.txt
    dir/b.txt
    dir/c/
  """
  uri_strs = PluralityCheckableIterator(uri_strs)
  name_expansion_iterator = _NameExpansionIterator(
      command_name, proj_id_handler, headers, debug, logger,
      bucket_storage_uri_class, uri_strs, recursion_requested,
      have_existing_dst_container, flat, all_versions=all_versions,
      for_all_version_delete=for_all_version_delete,
      cmd_supports_recursion=cmd_supports_recursion)
  name_expansion_iterator = PluralityCheckableIterator(name_expansion_iterator)
  if name_expansion_iterator.is_empty():
    raise CommandException('No URIs matched')
  return name_expansion_iterator
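# The factory's essential job: build the lazy expansion generator, peek at it
# so an empty expansion fails fast, and hand back an iterator that still
# yields everything. A self-contained approximation of that peek-then-chain
# pattern; the names and ValueError here are illustrative, not gsutil's.
import itertools

def CheckedExpansion(uri_strs, expand):
  """expand(uri) yields matches for uri; raises if nothing matches at all."""
  def Generate():
    for uri in uri_strs:
      for result in expand(uri):
        yield result
  it = Generate()
  try:
    first = next(it)
  except StopIteration:
    raise ValueError('No URIs matched')
  return itertools.chain([first], it)

# CheckedExpansion(['gs://b/abc*'], lambda uri: ['gs://b/abcd']) yields one
# result; with an expand() that yields nothing it raises immediately instead.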
class _DiffIterator(object): """Iterator yielding sequence of _DiffToApply objects.""" def __init__(self, command_obj, base_src_url, base_dst_url): self.command_obj = command_obj self.compute_file_checksums = command_obj.compute_file_checksums self.delete_extras = command_obj.delete_extras self.recursion_requested = command_obj.recursion_requested self.logger = self.command_obj.logger self.base_src_url = base_src_url self.base_dst_url = base_dst_url self.logger.info('Building synchronization state...') (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp( prefix='gsutil-rsync-src-') _tmp_files.append(self.sorted_list_src_file_name) (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp( prefix='gsutil-rsync-dst-') _tmp_files.append(self.sorted_list_dst_file_name) # Close the file handles; the file will be opened in write mode by # _ListUrlRootFunc. os.close(src_fh) os.close(dst_fh) # Build sorted lists of src and dst URLs in parallel. To do this, pass args # to _ListUrlRootFunc as tuple (base_url_str, out_filename, desc) # where base_url_str is the starting URL string for listing. args_iter = iter([ (self.base_src_url.url_string, self.sorted_list_src_file_name, 'source'), (self.base_dst_url.url_string, self.sorted_list_dst_file_name, 'destination') ]) # Contains error message from non-retryable listing failure. command_obj.non_retryable_listing_failures = 0 shared_attrs = ['non_retryable_listing_failures'] command_obj.Apply(_ListUrlRootFunc, args_iter, _RootListingExceptionHandler, shared_attrs, arg_checker=DummyArgChecker, parallel_operations_override=True, fail_on_error=True) if command_obj.non_retryable_listing_failures: raise CommandException('Caught non-retryable exception - aborting rsync') self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'r') self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'r') # Wrap iterators in PluralityCheckableIterator so we can check emptiness. self.sorted_src_urls_it = PluralityCheckableIterator( iter(self.sorted_list_src_file)) self.sorted_dst_urls_it = PluralityCheckableIterator( iter(self.sorted_list_dst_file)) def _ParseTmpFileLine(self, line): """Parses output from _BuildTmpOutputLine. Parses into tuple: (URL, size, crc32c, md5) where crc32c and/or md5 can be _NA. Args: line: The line to parse. Returns: Parsed tuple: (url, size, crc32c, md5) """ (encoded_url, size, crc32c, md5) = line.split() return (_DecodeUrl(encoded_url), int(size), crc32c, md5.strip()) def _WarnIfMissingCloudHash(self, url_str, crc32c, md5): """Warns if given url_str is a cloud URL and is missing both crc32c and md5. Args: url_str: Destination URL string. crc32c: Destination CRC32c. md5: Destination MD5. Returns: True if issued warning. """ # One known way this can currently happen is when rsync'ing objects larger # than 5 GB from S3 (for which the etag is not an MD5). if (StorageUrlFromString(url_str).IsCloudUrl() and crc32c == _NA and md5 == _NA): self.logger.warn( 'Found no hashes to validate %s. Integrity cannot be assured without ' 'hashes.', url_str) return True return False def _ObjectsMatch(self, src_url_str, src_size, src_crc32c, src_md5, dst_url_str, dst_size, dst_crc32c, dst_md5): """Returns True if src and dst objects are the same. Uses size plus whatever checksums are available. Args: src_url_str: Source URL string. src_size: Source size src_crc32c: Source CRC32c. src_md5: Source MD5. dst_url_str: Destination URL string. dst_size: Destination size dst_crc32c: Destination CRC32c. dst_md5: Destination MD5. Returns: True/False. 
""" # Note: This function is called from __iter__, which is called from the # Command.Apply driver. Thus, all checksum computation will be run in a # single thread, which is good (having multiple threads concurrently # computing checksums would thrash the disk). if src_size != dst_size: return False if self.compute_file_checksums: (src_crc32c, src_md5, dst_crc32c, dst_md5) = _ComputeNeededFileChecksums( self.logger, src_url_str, src_size, src_crc32c, src_md5, dst_url_str, dst_size, dst_crc32c, dst_md5) if src_md5 != _NA and dst_md5 != _NA: self.logger.debug('Comparing md5 for %s and %s', src_url_str, dst_url_str) return src_md5 == dst_md5 if src_crc32c != _NA and dst_crc32c != _NA: self.logger.debug( 'Comparing crc32c for %s and %s', src_url_str, dst_url_str) return src_crc32c == dst_crc32c if not self._WarnIfMissingCloudHash(src_url_str, src_crc32c, src_md5): self._WarnIfMissingCloudHash(dst_url_str, dst_crc32c, dst_md5) # Without checksums to compare we depend only on basic size comparison. return True def __iter__(self): """Iterates over src/dst URLs and produces a _DiffToApply sequence. Yields: The _DiffToApply. """ # Strip trailing slashes, if any, so we compute tail length against # consistent position regardless of whether trailing slashes were included # or not in URL. base_src_url_len = len(self.base_src_url.url_string.rstrip('/\\')) base_dst_url_len = len(self.base_dst_url.url_string.rstrip('/\\')) src_url_str = dst_url_str = None # Invariant: After each yield, the URLs in src_url_str, dst_url_str, # self.sorted_src_urls_it, and self.sorted_dst_urls_it are not yet # processed. Each time we encounter None in src_url_str or dst_url_str we # populate from the respective iterator, and we reset one or the other value # to None after yielding an action that disposes of that URL. while not self.sorted_src_urls_it.IsEmpty() or src_url_str is not None: if src_url_str is None: (src_url_str, src_size, src_crc32c, src_md5) = self._ParseTmpFileLine( self.sorted_src_urls_it.next()) # Skip past base URL and normalize slashes so we can compare across # clouds/file systems (including Windows). src_url_str_to_check = _EncodeUrl( src_url_str[base_src_url_len:].replace('\\', '/')) dst_url_str_would_copy_to = copy_helper.ConstructDstUrl( self.base_src_url, StorageUrlFromString(src_url_str), True, True, self.base_dst_url, False, self.recursion_requested).url_string if self.sorted_dst_urls_it.IsEmpty(): # We've reached end of dst URLs, so copy src to dst. yield _DiffToApply( src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY) src_url_str = None continue if not dst_url_str: (dst_url_str, dst_size, dst_crc32c, dst_md5) = ( self._ParseTmpFileLine(self.sorted_dst_urls_it.next())) # Skip past base URL and normalize slashes so we can compare acros # clouds/file systems (including Windows). dst_url_str_to_check = _EncodeUrl( dst_url_str[base_dst_url_len:].replace('\\', '/')) if src_url_str_to_check < dst_url_str_to_check: # There's no dst object corresponding to src object, so copy src to dst. yield _DiffToApply( src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY) src_url_str = None elif src_url_str_to_check > dst_url_str_to_check: # dst object without a corresponding src object, so remove dst if -d # option was specified. if self.delete_extras: yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE) dst_url_str = None else: # There is a dst object corresponding to src object, so check if objects # match. 
if self._ObjectsMatch( src_url_str, src_size, src_crc32c, src_md5, dst_url_str, dst_size, dst_crc32c, dst_md5): # Continue iterating without yielding a _DiffToApply. pass else: yield _DiffToApply(src_url_str, dst_url_str, _DiffAction.COPY) src_url_str = None dst_url_str = None # If -d option specified any files/objects left in dst iteration should be # removed. if not self.delete_extras: return if dst_url_str: yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE) dst_url_str = None for line in self.sorted_dst_urls_it: (dst_url_str, _, _, _) = self._ParseTmpFileLine(line) yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
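# The __iter__ above is a merge of two sorted listings. With the checksum,
# URL-encoding, and temp-file plumbing removed, the control flow reduces to
# the sketch below; the (name, size) tuples and ('copy'/'remove', name)
# actions are illustrative stand-ins for _DiffToApply.
def DiffSortedListings(src, dst, delete_extras=False):
  """src and dst are lists of (relative_name, size), sorted by name."""
  si = di = 0
  while si < len(src):
    if di >= len(dst) or src[si][0] < dst[di][0]:
      # Source entry with no destination counterpart: copy it.
      yield ('copy', src[si][0])
      si += 1
    elif src[si][0] > dst[di][0]:
      # Destination entry with no source counterpart: remove it under -d.
      if delete_extras:
        yield ('remove', dst[di][0])
      di += 1
    else:
      # Same name on both sides: copy only if the cheap comparison differs.
      if src[si][1] != dst[di][1]:
        yield ('copy', src[si][0])
      si += 1
      di += 1
  if delete_extras:
    while di < len(dst):
      yield ('remove', dst[di][0])
      di += 1

# list(DiffSortedListings([('a', 1), ('c', 2)], [('b', 9), ('c', 2)], True))
# -> [('copy', 'a'), ('remove', 'b')]; 'c' matches on size and is skipped.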
    class IterTest(object):

      def __init__(self):
        self.position = 0

      def __iter__(self):
        return self

      def next(self):
        if self.position == 0:
          try:
            self.position += 1
            raise CustomTestException('Test exception 0')
          except CustomTestException, e:
            # Yield the deferred-failure tuple rather than raising directly.
            return (e, sys.exc_info()[2])
        elif self.position == 1:
          self.position += 1
          return 1
        else:
          raise StopIteration()

    pcit = PluralityCheckableIterator(IterTest())
    try:
      for _ in pcit:
        pass
      self.fail('Expected exception 0 from iterator')
    except CustomTestException, e:
      self.assertIn('Test exception 0', e.message)
    iterated_value = None
    for value in pcit:
      iterated_value = value
    self.assertEqual(iterated_value, 1)
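# The (exception, stack_trace) tuples this test feeds through the iterator
# let a producer record a failure for one input and keep yielding results for
# the rest. A small hypothetical producer/consumer pair showing the same
# convention (Python 3 re-raise syntax is noted in a comment):
import sys

def Producer(urls):
  for url in urls:
    try:
      if url.startswith('bad'):
        raise ValueError('No URLs matched: %s' % url)
      yield url.upper()
    except ValueError as e:
      yield (e, sys.exc_info()[2])  # Defer the failure; keep producing.

def Consume(it):
  results, errors = [], []
  for item in it:
    if isinstance(item, tuple) and isinstance(item[0], Exception):
      errors.append(item[0])  # Or re-raise: raise item[0].with_traceback(item[1])
    else:
      results.append(item)
  return results, errors

# Consume(Producer(['good1', 'bad2', 'good3']))
# -> (['GOOD1', 'GOOD3'], [ValueError('No URLs matched: bad2')])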