Example #1
def testPluralityCheckableIteratorWith1Elem1Exception(self):
        """Tests PluralityCheckableIterator with 2 elements.

    The second element raises an exception.
    """
        class IterTest(object):
            def __init__(self):
                self.position = 0

            def __iter__(self):
                return self

            def next(self):
                if self.position == 0:
                    self.position += 1
                    return 1
                elif self.position == 1:
                    self.position += 1
                    raise CustomTestException('Test exception')
                else:
                    raise StopIteration()

        pcit = PluralityCheckableIterator(IterTest())
        self.assertFalse(pcit.IsEmpty())
        self.assertTrue(pcit.HasPlurality())
        iterated_value = None
        try:
            for value in pcit:
                iterated_value = value
            self.fail('Expected exception from iterator')
        except CustomTestException:
            pass
        self.assertEqual(iterated_value, 1)
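The test above depends on PluralityCheckableIterator reading ahead and buffering elements. A minimal sketch of that buffering idea (an illustration only, not gsutil's actual implementation, which additionally buffers exceptions raised during read-ahead):

class BufferingIteratorSketch(object):
    """Sketch: peek up to two elements without consuming them."""

    def __init__(self, it):
        self.orig_iterator = iter(it)
        self.buffer = []  # elements read ahead but not yet yielded

    def _Fill(self, n):
        # Read ahead until the buffer holds n elements or the source ends.
        while len(self.buffer) < n:
            try:
                self.buffer.append(next(self.orig_iterator))
            except StopIteration:
                break

    def IsEmpty(self):
        self._Fill(1)
        return not self.buffer

    def HasPlurality(self):
        self._Fill(2)
        return len(self.buffer) > 1

    def __iter__(self):
        return self

    def __next__(self):
        # Serve buffered elements first, preserving the original order.
        self._Fill(1)
        if self.buffer:
            return self.buffer.pop(0)
        raise StopIteration()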
Example #2
File: du.py Project: Hex29A/gsutil
  def _RecursePrint(self, blr):
    """
    Expands a bucket listing reference and recurses to its children, calling
    _PrintInfoAboutBucketListingRef for each expanded object found.

    Args:
      blr: An instance of BucketListingRef.

    Returns:
      Tuple containing (number of objects, total number of bytes)
    """
    num_bytes = 0
    num_objs = 0

    if blr.HasKey():
      blr_iterator = iter([blr])
    elif blr.HasPrefix():
      blr_iterator = self.WildcardIterator(
          '%s/*' % blr.GetRStrippedUriString(), all_versions=self.all_versions)
    elif blr.NamesBucket():
      blr_iterator = self.WildcardIterator(
          '%s*' % blr.GetUriString(), all_versions=self.all_versions)
    else:
      # This BLR didn't come from a bucket listing. This case happens for
      # BLR's instantiated from a user-provided URI.
      blr_iterator = PluralityCheckableIterator(
          UriOnlyBlrExpansionIterator(
              self, blr, all_versions=self.all_versions))
      if blr_iterator.is_empty() and not ContainsWildcard(blr.GetUriString()):
        raise CommandException('No such object %s' % blr.GetUriString())

    for cur_blr in blr_iterator:
      if self.exclude_patterns:
        tomatch = cur_blr.GetUriString()
        skip = False
        for pattern in self.exclude_patterns:
          if fnmatch.fnmatch(tomatch, pattern):
            skip = True
            break
        if skip:
          continue
      if cur_blr.HasKey():
        # Object listing.
        no, nb = self._PrintInfoAboutBucketListingRef(cur_blr)
      else:
        # Subdir listing.
        if cur_blr.GetUriString().endswith('//'):
          # Expand gs://bucket// into gs://bucket//* so we don't infinite
          # loop. This case happens when user has uploaded an object whose
          # name begins with a /.
          cur_blr = BucketListingRef(self.suri_builder.StorageUri(
              '%s*' % cur_blr.GetUriString()), None, None, cur_blr.headers)
        no, nb = self._RecursePrint(cur_blr)
      num_bytes += nb
      num_objs += no

    if blr.HasPrefix() and not self.summary_only:
      self._PrintSummaryLine(num_bytes, blr.GetUriString().encode('utf-8'))

    return num_objs, num_bytes
Example #3
    def _GetIam(self, thread_state=None):
        """Gets IAM policy for single bucket or object."""

        pattern = self.args[0]

        matches = PluralityCheckableIterator(
            self.WildcardIterator(pattern).IterAll(
                bucket_listing_fields=['name']))
        if matches.IsEmpty():
            raise CommandException('%s matched no URLs' % pattern)
        if matches.HasPlurality():
            raise CommandException(
                '%s matched more than one URL, which is not allowed by the %s '
                'command' % (pattern, self.command_name))

        storage_url = StorageUrlFromString(list(matches)[0].url_string)
        policy = self.GetIamHelper(storage_url, thread_state=thread_state)
        policy_json = json.loads(protojson.encode_message(policy))
        policy_str = json.dumps(
            policy_json,
            sort_keys=True,
            separators=(',', ': '),
            indent=2,
        )
        print(policy_str)
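The IsEmpty/HasPlurality pair above forms an "exactly one match" guard. The same logic, distilled into a helper (the helper name is hypothetical; the exception messages mirror the code above):

def RequireExactlyOneMatch(matches, pattern, command_name):
    # matches is a PluralityCheckableIterator over wildcard-expansion results.
    if matches.IsEmpty():
        raise CommandException('%s matched no URLs' % pattern)
    if matches.HasPlurality():
        raise CommandException(
            '%s matched more than one URL, which is not allowed by the %s '
            'command' % (pattern, command_name))
    return list(matches)[0]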
Example #4
 def testPluralityCheckableIteratorWith0Elems(self):
     """Tests empty PluralityCheckableIterator."""
     input_list = list(range(0))
     it = iter(input_list)
     pcit = PluralityCheckableIterator(it)
     self.assertTrue(pcit.IsEmpty())
     self.assertFalse(pcit.HasPlurality())
     output_list = list(pcit)
     self.assertEqual(input_list, output_list)
Example #5
 def testPluralityCheckableIteratorWith3Elems(self):
   """Tests PluralityCheckableIterator with 3 elements."""
   input_list = range(3)
   it = iter(input_list)
   pcit = PluralityCheckableIterator(it)
   self.assertFalse(pcit.is_empty())
   self.assertTrue(pcit.has_plurality())
   output_list = list(pcit)
   self.assertEqual(input_list, output_list)
Example #6
def NameExpansionIterator(command_name,
                          debug,
                          logger,
                          gsutil_api,
                          url_strs,
                          recursion_requested,
                          all_versions=False,
                          cmd_supports_recursion=True,
                          project_id=None,
                          continue_on_error=False):
    """Static factory function for instantiating _NameExpansionIterator.

  This wraps the resulting iterator in a PluralityCheckableIterator and checks
  that it is non-empty. Also, allows url_strs to be either an array or an
  iterator.

  Args:
    command_name: name of command being run.
    debug: Debug level to pass to underlying iterators (range 0..3).
    logger: logging.Logger object.
    gsutil_api: Cloud storage interface.  Settable for testing/mocking.
    url_strs: Iterable URL strings needing expansion.
    recursion_requested: True if -r specified on command-line.  If so,
        listings will be flattened so mapped-to results contain objects
        spanning subdirectories.
    all_versions: Bool indicating whether to iterate over all object versions.
    cmd_supports_recursion: Bool indicating whether this command supports a '-r'
        flag. Useful for printing helpful error messages.
    project_id: Project id to use for the current command.
    continue_on_error: If true, yield no-match exceptions encountered during
                       iteration instead of raising them.

  Raises:
    CommandException if underlying iterator is empty.

  Returns:
    Name expansion iterator instance.

  For example semantics, see comments in NameExpansionIterator.__init__.
  """
    url_strs = PluralityCheckableIterator(url_strs)
    name_expansion_iterator = _NameExpansionIterator(
        command_name,
        debug,
        logger,
        gsutil_api,
        url_strs,
        recursion_requested,
        all_versions=all_versions,
        cmd_supports_recursion=cmd_supports_recursion,
        project_id=project_id,
        continue_on_error=continue_on_error)
    name_expansion_iterator = PluralityCheckableIterator(
        name_expansion_iterator)
    if name_expansion_iterator.IsEmpty():
        raise CommandException('No URLs matched')
    return name_expansion_iterator
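This factory shows the recurring wrap-and-check pattern: wrapping any iterator in PluralityCheckableIterator makes emptiness observable without consuming elements. A generic sketch of that pattern (the helper name is illustrative):

def NonEmptyIteratorOrRaise(it, message):
    # IsEmpty buffers at most one element, so the returned iterator still
    # yields everything the original would have.
    pcit = PluralityCheckableIterator(it)
    if pcit.IsEmpty():
        raise CommandException(message)
    return pcit

NameExpansionIterator above is this pattern with the message 'No URLs matched'.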
Example #7
 def testPluralityCheckableIteratorWith2Elems(self):
     """Tests PluralityCheckableIterator with 2 elements."""
     input_list = range(2)
     it = iter(input_list)
     pcit = PluralityCheckableIterator(it)
     self.assertFalse(pcit.is_empty())
     self.assertTrue(pcit.has_plurality())
     output_list = list(pcit)
     self.assertEqual(input_list, output_list)
Example #8
 def testPluralityCheckableIteratorWith3Elems(self):
     """Tests PluralityCheckableIterator with 3 elements."""
     input_list = range(3)
     it = iter(input_list)
     pcit = PluralityCheckableIterator(it)
     self.assertFalse(pcit.IsEmpty())
     self.assertTrue(pcit.HasPlurality())
     output_list = list(pcit)
     self.assertEqual(input_list, output_list)
Example #9
  def ExpandUrlAndPrint(self, url):
    """Iterates over the given URL and calls print functions.

    Args:
      url: StorageUrl to iterate over.

    Returns:
      (num_objects, num_bytes) total number of objects and bytes iterated.
    """
    num_objects = 0
    num_dirs = 0
    num_bytes = 0
    print_newline = False

    if url.IsBucket() or self.should_recurse:
      # IsBucket() implies a top-level listing.
      if url.IsBucket():
        self._print_bucket_header_func(url)
      return self._RecurseExpandUrlAndPrint(url.url_string,
                                            print_initial_newline=False)
    else:
      # User provided a prefix or object URL, but it's impossible to tell
      # which until we do a listing and see what matches.
      top_level_iterator = PluralityCheckableIterator(self._iterator_func(
          url.CreatePrefixUrl(wildcard_suffix=None),
          all_versions=self.all_versions).IterAll(
              expand_top_level_buckets=True,
              bucket_listing_fields=self.bucket_listing_fields))
      plurality = top_level_iterator.HasPlurality()

      for blr in top_level_iterator:
        if self._MatchesExcludedPattern(blr):
          continue
        if blr.IsObject():
          nd = 0
          no, nb = self._print_object_func(blr)
          print_newline = True
        elif blr.IsPrefix():
          if print_newline:
            self._print_newline_func()
          else:
            print_newline = True
          if plurality:
            self._print_dir_header_func(blr)
          expansion_url_str = StorageUrlFromString(
              blr.url_string).CreatePrefixUrl(wildcard_suffix='*')
          nd, no, nb = self._RecurseExpandUrlAndPrint(expansion_url_str)
          self._print_dir_summary_func(nb, blr)
        else:
          # We handle all buckets at the top level, so this should never happen.
          raise CommandException(
              'Sub-level iterator returned a CsBucketListingRef of type Bucket')
        num_objects += no
        num_dirs += nd
        num_bytes += nb
      return num_dirs, num_objects, num_bytes
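Note that HasPlurality() is called before the loop: it buffers up to two listing results but consumes none, so the for loop still sees every match. A runnable sketch of that peek-then-iterate usage (the listing data and printer helpers are illustrative stand-ins; assumes gsutil's gslib package is importable):

from gslib.plurality_checkable_iterator import PluralityCheckableIterator

listing_results = ['gs://bucket/dir1/', 'gs://bucket/dir2/']
is_prefix = lambda blr: blr.endswith('/')      # stand-in for blr.IsPrefix()
print_header = lambda blr: print('%s:' % blr)  # stand-in for the dir header func

pcit = PluralityCheckableIterator(iter(listing_results))
plurality = pcit.HasPlurality()  # peeks at two results, consumes none
for blr in pcit:                 # the loop still yields every result, in order
    if plurality and is_prefix(blr):
        print_header(blr)        # headers only when listing several subdirs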
Example #10
    def __init__(self, command_obj, base_src_url, base_dst_url):
        self.command_obj = command_obj
        self.compute_file_checksums = command_obj.compute_file_checksums
        self.delete_extras = command_obj.delete_extras
        self.recursion_requested = command_obj.recursion_requested
        self.logger = self.command_obj.logger
        self.base_src_url = base_src_url
        self.base_dst_url = base_dst_url
        self.logger.info('Building synchronization state...')

        (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp(
            prefix='gsutil-rsync-src-')
        _tmp_files.append(self.sorted_list_src_file_name)
        (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp(
            prefix='gsutil-rsync-dst-')
        _tmp_files.append(self.sorted_list_dst_file_name)
        # Close the file handles; the file will be opened in write mode by
        # _ListUrlRootFunc.
        os.close(src_fh)
        os.close(dst_fh)

        # Build sorted lists of src and dst URLs in parallel. To do this, pass args
        # to _ListUrlRootFunc as tuple (base_url_str, out_filename, desc)
        # where base_url_str is the starting URL string for listing.
        args_iter = iter([(self.base_src_url.url_string,
                           self.sorted_list_src_file_name, 'source'),
                          (self.base_dst_url.url_string,
                           self.sorted_list_dst_file_name, 'destination')])

        # Contains error message from non-retryable listing failure.
        command_obj.non_retryable_listing_failures = 0
        shared_attrs = ['non_retryable_listing_failures']
        command_obj.Apply(_ListUrlRootFunc,
                          args_iter,
                          _RootListingExceptionHandler,
                          shared_attrs,
                          arg_checker=DummyArgChecker,
                          parallel_operations_override=True,
                          fail_on_error=True)

        if command_obj.non_retryable_listing_failures:
            raise CommandException(
                'Caught non-retryable exception - aborting rsync')

        self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'r')
        self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'r')

        # Wrap iterators in PluralityCheckableIterator so we can check emptiness.
        self.sorted_src_urls_it = PluralityCheckableIterator(
            iter(self.sorted_list_src_file))
        self.sorted_dst_urls_it = PluralityCheckableIterator(
            iter(self.sorted_list_dst_file))
Example #11
    def __init__(self, command_obj, base_src_url, base_dst_url):
        self.command_obj = command_obj
        self.compute_file_checksums = command_obj.compute_file_checksums
        self.delete_extras = command_obj.delete_extras
        self.recursion_requested = command_obj.recursion_requested
        self.logger = self.command_obj.logger
        self.base_src_url = base_src_url
        self.base_dst_url = base_dst_url
        self.logger.info("Building synchronization state...")

        (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp(prefix="gsutil-rsync-src-")
        _tmp_files.append(self.sorted_list_src_file_name)
        (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp(prefix="gsutil-rsync-dst-")
        _tmp_files.append(self.sorted_list_dst_file_name)
        # Close the file handles; the file will be opened in write mode by
        # _ListUrlRootFunc.
        os.close(src_fh)
        os.close(dst_fh)

        # Build sorted lists of src and dst URLs in parallel. To do this, pass args
        # to _ListUrlRootFunc as tuple (base_url_str, out_filename, desc)
        # where base_url_str is the starting URL string for listing.
        args_iter = iter(
            [
                (self.base_src_url.url_string, self.sorted_list_src_file_name, "source"),
                (self.base_dst_url.url_string, self.sorted_list_dst_file_name, "destination"),
            ]
        )

        # Contains error message from non-retryable listing failure.
        command_obj.non_retryable_listing_failures = 0
        shared_attrs = ["non_retryable_listing_failures"]
        command_obj.Apply(
            _ListUrlRootFunc,
            args_iter,
            _RootListingExceptionHandler,
            shared_attrs,
            arg_checker=DummyArgChecker,
            parallel_operations_override=True,
            fail_on_error=True,
        )

        if command_obj.non_retryable_listing_failures:
            raise CommandException("Caught non-retryable exception - aborting rsync")

        self.sorted_list_src_file = open(self.sorted_list_src_file_name, "r")
        self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, "r")

        # Wrap iterators in PluralityCheckableIterator so we can check emptiness.
        self.sorted_src_urls_it = PluralityCheckableIterator(iter(self.sorted_list_src_file))
        self.sorted_dst_urls_it = PluralityCheckableIterator(iter(self.sorted_list_dst_file))
Example #12
  def __init__(self, command_obj, base_src_url, base_dst_url):
    self.command_obj = command_obj
    self.compute_checksums = command_obj.compute_checksums
    self.delete_extras = command_obj.delete_extras
    self.recursion_requested = command_obj.recursion_requested
    self.logger = self.command_obj.logger
    self.base_src_url = base_src_url
    self.base_dst_url = base_dst_url
    self.logger.info('Building synchronization state...')

    (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-src-')
    (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-dst-')
    # Close the file handles; the file will be opened in write mode by
    # _ListUrlRootFunc.
    os.close(src_fh)
    os.close(dst_fh)

    # Build sorted lists of src and dst URLs in parallel. To do this, pass args
    # to _ListUrlRootFunc as tuple (url_str, out_file_name, desc).
    args_iter = iter([
        (self.base_src_url.GetUrlString(), self.sorted_list_src_file_name,
         'source'),
        (self.base_dst_url.GetUrlString(), self.sorted_list_dst_file_name,
         'destination')
    ])
    if IS_WINDOWS:
      # Don't use multi-processing on Windows (very broken).
      thread_count = 2
      process_count = 1
    else:
      # Otherwise use multi-processing, to avoid Python global thread lock
      # contention.
      thread_count = 1
      process_count = 2
    command_obj.Apply(_ListUrlRootFunc, args_iter, _RootListingExceptionHandler,
                      arg_checker=DummyArgChecker,
                      parallel_operations_override=True,
                      thread_count=thread_count, process_count=process_count,
                      fail_on_error=True)

    self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'r')
    self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'r')

    # Wrap iterators in PluralityCheckableIterator so we can check emptiness.
    self.sorted_src_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_src_file))
    self.sorted_dst_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_dst_file))
Example #13
    def _GetIam(self, pattern, thread_state=None):
        """Gets IAM policy for single bucket or object."""

        matches = PluralityCheckableIterator(
            self.WildcardIterator(pattern).IterAll(
                bucket_listing_fields=['name']))
        if matches.IsEmpty():
            raise CommandException('%s matched no URLs' % pattern)
        if matches.HasPlurality():
            raise CommandException(
                '%s matched more than one URL, which is not allowed by the %s '
                'command' % (pattern, self.command_name))

        storage_url = StorageUrlFromString(list(matches)[0].url_string)
        return self.GetIamHelper(storage_url, thread_state=thread_state)
Example #14
    def __init__(self,
                 command_name,
                 debug,
                 gsutil_api,
                 url_strs,
                 recursion_requested,
                 all_versions=False,
                 cmd_supports_recursion=True,
                 project_id=None):
        """Initializes a _NameExpansionIterator with the inputs."""

        # Count data bytes only for commands that transfer or rewrite data.
        # Note that the rsync command uses a different iterator, thus it is not
        # included here.
        self.count_data_bytes = command_name in ('cp', 'mv', 'rewrite')

        # Only query the file size if we are counting data bytes, as this may
        # result in stat'ing files, which is more expensive.
        bucket_listing_fields = ['size'] if self.count_data_bytes else None

        self.name_expansion_iterator = _NameExpansionIterator(
            command_name,
            debug,
            logging.getLogger('dummy'),
            gsutil_api,
            PluralityCheckableIterator(url_strs),
            recursion_requested,
            all_versions=all_versions,
            cmd_supports_recursion=cmd_supports_recursion,
            project_id=project_id,
            continue_on_error=True,
            bucket_listing_fields=bucket_listing_fields)
Example #15
 def __iter__(self):
     for blr in self.blr_iter:
         uri = blr.GetUri()
         if uri.names_object():
             # URI could be a bucket subdir.
             implicit_subdir_iterator = PluralityCheckableIterator(
                 self.name_expansion_instance._WildcardIterator(
                     self.name_expansion_instance.suri_builder.StorageUri(
                         '%s/%s' %
                         (uri.uri.rstrip('/'), self.name_expansion_instance.
                          _flatness_wildcard[self.flat]))))
             if not implicit_subdir_iterator.is_empty():
                 for exp_blr in implicit_subdir_iterator:
                     yield (True, exp_blr)
             else:
                 yield (False, blr)
         else:
             yield (False, blr)
Example #16
def NameExpansionIterator(command_name, proj_id_handler, headers, debug,
                          bucket_storage_uri_class, uri_strs,
                          recursion_requested,
                          have_existing_dst_container=None, flat=True):
  """
  Static factory function for instantiating _NameExpansionIterator, which
  wraps the resulting iterator in a PluralityCheckableIterator and checks
  that it is non-empty.

  Args are as documented in constructor for _NameExpansionIterator class.
  """
  name_expansion_iterator = _NameExpansionIterator(
      command_name, proj_id_handler, headers, debug, bucket_storage_uri_class,
      uri_strs, recursion_requested, have_existing_dst_container, flat)
  name_expansion_iterator = PluralityCheckableIterator(name_expansion_iterator)
  if name_expansion_iterator.is_empty():
    raise CommandException('No URIs matched')
  return name_expansion_iterator
Example #17
 def __iter__(self):
   for blr in self.blr_iter:
     uri = blr.GetUri()
     if uri.names_object():
       # URI could be a bucket subdir.
       implicit_subdir_iterator = PluralityCheckableIterator(
           self.name_expansion_instance._WildcardIterator(
               self.name_expansion_instance.suri_builder.StorageUri(
                   '%s/%s' % (uri.uri.rstrip('/'),
                   self.name_expansion_instance._flatness_wildcard[
                       self.flat]))))
       if not implicit_subdir_iterator.is_empty():
         for exp_blr in implicit_subdir_iterator:
           yield (True, exp_blr)
       else:
         yield (False, blr)
     else:
       yield (False, blr)
Example #18
    def testPluralityCheckableIteratorWith2Exceptions(self):
        """Tests PluralityCheckableIterator with 2 elements that both raise."""
        class IterTest(six.Iterator):
            def __init__(self):
                self.position = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.position < 2:
                    self.position += 1
                    raise CustomTestException('Test exception %s' %
                                              self.position)
                else:
                    raise StopIteration()

        pcit = PluralityCheckableIterator(IterTest())
        try:
            pcit.PeekException()
            self.fail('Expected exception 1 from PeekException')
        except CustomTestException as e:
            self.assertIn(str(e), 'Test exception 1')
        try:
            for _ in pcit:
                pass
            self.fail('Expected exception 1 from iterator')
        except CustomTestException as e:
            self.assertIn(str(e), 'Test exception 1')
        try:
            pcit.PeekException()
            self.fail('Expected exception 2 from PeekException')
        except CustomTestException as e:
            self.assertIn(str(e), 'Test exception 2')
        try:
            for _ in pcit:
                pass
            self.fail('Expected exception 2 from iterator')
        except CustomTestException as e:
            self.assertIn(str(e), 'Test exception 2')
        for _ in pcit:
            self.fail('Expected StopIteration')
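The test above exercises exception buffering: exceptions raised during read-ahead are stored and re-raised, first non-destructively by PeekException and then destructively by iteration. A sketch of how that replay behavior could work (an assumption-based illustration, not gsutil's actual class):

import sys

class ExceptionBufferingIteratorSketch(object):
    def __init__(self, it):
        self.orig_iterator = iter(it)
        self.buffer = []  # entries are ('elem', value) or ('exc', exc_info)

    def _Fill(self, n):
        while len(self.buffer) < n:
            try:
                self.buffer.append(('elem', next(self.orig_iterator)))
            except StopIteration:
                break
            except Exception:
                self.buffer.append(('exc', sys.exc_info()))

    def PeekException(self):
        # Re-raise a buffered exception without consuming it.
        self._Fill(1)
        if self.buffer and self.buffer[0][0] == 'exc':
            _, exc_value, exc_tb = self.buffer[0][1]
            raise exc_value.with_traceback(exc_tb)

    def __iter__(self):
        return self

    def __next__(self):
        self._Fill(1)
        if not self.buffer:
            raise StopIteration()
        kind, payload = self.buffer.pop(0)
        if kind == 'exc':
            raise payload[1].with_traceback(payload[2])
        return payload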
Example #19
 def __iter__(self):
   for blr in self.blr_iter:
     if blr.IsPrefix():
       # This is a bucket subdirectory, list objects according to the wildcard.
       prefix_url = StorageUrlFromString(blr.url_string).CreatePrefixUrl(
           wildcard_suffix=self.subdir_exp_wildcard)
       implicit_subdir_iterator = PluralityCheckableIterator(
           self.name_exp_instance.WildcardIterator(prefix_url).IterAll(
               bucket_listing_fields=self.bucket_listing_fields))
       if not implicit_subdir_iterator.IsEmpty():
         for exp_blr in implicit_subdir_iterator:
           yield (True, exp_blr)
       else:
         # Prefix that contains no objects, for example in the $folder$ case
         # or an empty filesystem directory.
         yield (False, blr)
     elif blr.IsObject():
       yield (False, blr)
     else:
       raise CommandException(
           '_ImplicitBucketSubdirIterator got a bucket reference %s' % blr)
Example #20
File: rsync.py Project: altock/dev
  def __init__(self, command_obj, base_src_url, base_dst_url):
    self.command_obj = command_obj
    self.compute_checksums = command_obj.compute_checksums
    self.delete_extras = command_obj.delete_extras
    self.recursion_requested = command_obj.recursion_requested
    self.logger = self.command_obj.logger
    self.base_src_url = base_src_url
    self.base_dst_url = base_dst_url
    self.logger.info('Building synchronization state...')

    (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-src-')
    (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-dst-')
    # Close the file handles; the file will be opened in write mode by
    # _ListUrlRootFunc.
    os.close(src_fh)
    os.close(dst_fh)

    # Build sorted lists of src and dst URLs in parallel. To do this, pass args
    # to _ListUrlRootFunc as tuple (url_str, out_file_name, desc).
    args_iter = iter([
        (self.base_src_url.GetUrlString(), self.sorted_list_src_file_name,
         'source'),
        (self.base_dst_url.GetUrlString(), self.sorted_list_dst_file_name,
         'destination')
    ])
    command_obj.Apply(_ListUrlRootFunc, args_iter, _RootListingExceptionHandler,
                      arg_checker=DummyArgChecker,
                      parallel_operations_override=True,
                      fail_on_error=True)

    self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'rb')
    self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'rb')

    # Wrap iterators in PluralityCheckableIterator so we can check emptiness.
    self.sorted_src_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_src_file))
    self.sorted_dst_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_dst_file))
Example #21
  def __init__(self, command_obj, base_src_url, base_dst_url):
    self.command_obj = command_obj
    self.compute_checksums = command_obj.compute_checksums
    self.delete_extras = command_obj.delete_extras
    self.recursion_requested = command_obj.recursion_requested
    self.logger = self.command_obj.logger
    self.base_src_url = base_src_url
    self.base_dst_url = base_dst_url
    self.logger.info('Building synchronization state...')

    (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-src-')
    (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-dst-')
    # Close the file handles; the file will be opened in write mode by
    # _ListUrlRootFunc.
    os.close(src_fh)
    os.close(dst_fh)

    # Build sorted lists of src and dst URLs in parallel. To do this, pass args
    # to _ListUrlRootFunc as tuple (url_str, out_file_name, desc).
    args_iter = iter([
        (self.base_src_url.url_string, self.sorted_list_src_file_name,
         'source'),
        (self.base_dst_url.url_string, self.sorted_list_dst_file_name,
         'destination')
    ])
    command_obj.Apply(_ListUrlRootFunc, args_iter, _RootListingExceptionHandler,
                      arg_checker=DummyArgChecker,
                      parallel_operations_override=True,
                      fail_on_error=True)

    self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'r')
    self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'r')

    # Wrap iterators in PluralityCheckableIterator so we can check emptiness.
    self.sorted_src_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_src_file))
    self.sorted_dst_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_dst_file))
Example #22
    def testPluralityCheckableIteratorWith2Exceptions(self):
        """Tests PluralityCheckableIterator with 2 elements that both raise."""
        class IterTest(object):
            def __init__(self):
                self.position = 0

            def __iter__(self):
                return self

            def next(self):
                if self.position < 2:
                    self.position += 1
                    raise CustomTestException('Test exception %s' %
                                              self.position)
                else:
                    raise StopIteration()

        pcit = PluralityCheckableIterator(IterTest())
        try:
            pcit.PeekException()
            self.fail('Expected exception 1 from PeekException')
        except CustomTestException, e:
            self.assertIn(e.message, 'Test exception 1')
Example #23
    def testPluralityCheckableIteratorReadsAheadAsNeeded(self):
        """Tests that the PCI does not unnecessarily read new elements."""
        class IterTest(six.Iterator):
            def __init__(self):
                self.position = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.position == 3:
                    raise StopIteration()
                self.position += 1

        # IsEmpty and PeekException should retrieve only 1 element from the
        # underlying iterator.
        pcit = PluralityCheckableIterator(IterTest())
        pcit.IsEmpty()
        pcit.PeekException()
        self.assertEquals(pcit.orig_iterator.position, 1)
        # HasPlurality requires populating 2 elements into the iterator.
        pcit.HasPlurality()
        self.assertEquals(pcit.orig_iterator.position, 2)
        # next should yield already-populated elements without advancing the
        # iterator.
        next(pcit)  # Yields element 1
        self.assertEquals(pcit.orig_iterator.position, 2)
        next(pcit)  # Yields element 2
        self.assertEquals(pcit.orig_iterator.position, 2)
        next(pcit)  # Yields element 3
        self.assertEquals(pcit.orig_iterator.position, 3)
        try:
            next(pcit)  # Underlying iterator is empty
            self.fail('Expected StopIteration')
        except StopIteration:
            pass
Example #24
  def testPluralityCheckableIteratorReadsAheadAsNeeded(self):
    """Tests that the PCI does not unnecessarily read new elements."""

    class IterTest(object):

      def __init__(self):
        self.position = 0

      def __iter__(self):
        return self

      def next(self):
        if self.position == 3:
          raise StopIteration()
        self.position += 1

    # IsEmpty and PeekException should retrieve only 1 element from the
    # underlying iterator.
    pcit = PluralityCheckableIterator(IterTest())
    pcit.IsEmpty()
    pcit.PeekException()
    self.assertEquals(pcit.orig_iterator.position, 1)
    # HasPlurality requires populating 2 elements into the iterator.
    pcit.HasPlurality()
    self.assertEquals(pcit.orig_iterator.position, 2)
    # next should yield already-populated elements without advancing the
    # iterator.
    pcit.next()  # Yields element 1
    self.assertEquals(pcit.orig_iterator.position, 2)
    pcit.next()  # Yields element 2
    self.assertEquals(pcit.orig_iterator.position, 2)
    pcit.next()  # Yields element 3
    self.assertEquals(pcit.orig_iterator.position, 3)
    try:
      pcit.next()  # Underlying iterator is empty
      self.fail('Expected StopIteration')
    except StopIteration:
      pass
Example #25
    def testPluralityCheckableIteratorWithYieldedException(self):
        """Tests PCI with an iterator that yields an exception.

    The yielded exception is in the form of a tuple and must also contain a
    stack trace.
    """
        class IterTest(six.Iterator):
            def __init__(self):
                self.position = 0

            def __iter__(self):
                return self

            def __next__(self):
                if self.position == 0:
                    try:
                        self.position += 1
                        raise CustomTestException('Test exception 0')
                    except CustomTestException as e:
                        return (e, sys.exc_info()[2])
                elif self.position == 1:
                    self.position += 1
                    return 1
                else:
                    raise StopIteration()

        pcit = PluralityCheckableIterator(IterTest())
        iterated_value = None
        try:
            for _ in pcit:
                pass
            self.fail('Expected exception 0 from iterator')
        except CustomTestException as e:
            self.assertIn(str(e), 'Test exception 0')
        for value in pcit:
            iterated_value = value
        self.assertEqual(iterated_value, 1)
Example #26
File: rsync.py Project: altock/dev
class _DiffIterator(object):
  """Iterator yielding sequence of _DiffToApply objects."""

  def __init__(self, command_obj, base_src_url, base_dst_url):
    self.command_obj = command_obj
    self.compute_checksums = command_obj.compute_checksums
    self.delete_extras = command_obj.delete_extras
    self.recursion_requested = command_obj.recursion_requested
    self.logger = self.command_obj.logger
    self.base_src_url = base_src_url
    self.base_dst_url = base_dst_url
    self.logger.info('Building synchronization state...')

    (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-src-')
    (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-dst-')
    # Close the file handles; the file will be opened in write mode by
    # _ListUrlRootFunc.
    os.close(src_fh)
    os.close(dst_fh)

    # Build sorted lists of src and dst URLs in parallel. To do this, pass args
    # to _ListUrlRootFunc as tuple (url_str, out_file_name, desc).
    args_iter = iter([
        (self.base_src_url.GetUrlString(), self.sorted_list_src_file_name,
         'source'),
        (self.base_dst_url.GetUrlString(), self.sorted_list_dst_file_name,
         'destination')
    ])
    command_obj.Apply(_ListUrlRootFunc, args_iter, _RootListingExceptionHandler,
                      arg_checker=DummyArgChecker,
                      parallel_operations_override=True,
                      fail_on_error=True)

    self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'rb')
    self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'rb')

    # Wrap iterators in PluralityCheckableIterator so we can check emptiness.
    self.sorted_src_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_src_file))
    self.sorted_dst_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_dst_file))

  # pylint: disable=bare-except
  def CleanUpTempFiles(self):
    """Cleans up temp files.

    This function allows the main (RunCommand) function to clean up at end of
    operation. This is necessary because tempfile.NamedTemporaryFile doesn't
    allow the created file to be re-opened in read mode on Windows, so we have
    to use tempfile.mkstemp, which doesn't automatically delete temp files (see
    https://mail.python.org/pipermail/python-list/2005-December/336958.html).
    """
    try:
      self.sorted_list_src_file.close()
      self.sorted_list_dst_file.close()
      for fname in (self.sorted_list_src_file_name,
                    self.sorted_list_dst_file_name):
        os.unlink(fname)
    except:
      pass

  def _ParseTmpFileLine(self, line):
    """Parses output from _BuildTmpOutputLine.

    Parses into tuple:
      (URL, size, crc32c, md5)
    where crc32c and/or md5 can be _NA.

    Args:
      line: The line to parse.

    Returns:
      Parsed tuple: (url, size, crc32c, md5)
    """
    (encoded_url, size, crc32c, md5) = line.split()
    return (urllib.unquote_plus(encoded_url).decode(UTF8),
            int(size), crc32c, md5.strip())

  def _WarnIfMissingCloudHash(self, url_str, crc32c, md5):
    """Warns if given url_str is a cloud URL and is missing both crc32c and md5.

    Args:
      url_str: Destination URL string.
      crc32c: Destination CRC32c.
      md5: Destination MD5.

    Returns:
      True if issued warning.
    """
    # One known way this can currently happen is when rsync'ing objects larger
    # than 5GB from S3 (for which the etag is not an MD5).
    if (StorageUrlFromString(url_str).IsCloudUrl()
        and crc32c == _NA and md5 == _NA):
      self.logger.warn(
          'Found no hashes to validate %s. '
          'Integrity cannot be assured without hashes.' % url_str)
      return True
    return False

  def _ObjectsMatch(self, src_url_str, src_size, src_crc32c, src_md5,
                    dst_url_str, dst_size, dst_crc32c, dst_md5):
    """Returns True if src and dst objects are the same.

    Uses size plus whatever checksums are available.

    Args:
      src_url_str: Source URL string.
      src_size: Source size
      src_crc32c: Source CRC32c.
      src_md5: Source MD5.
      dst_url_str: Destination URL string.
      dst_size: Destination size
      dst_crc32c: Destination CRC32c.
      dst_md5: Destination MD5.

    Returns:
      True/False.
    """
    # Note: This function is called from __iter__, which is called from the
    # Command.Apply driver. Thus, all checksum computation will be run in a
    # single thread, which is good (having multiple threads concurrently
    # computing checksums would thrash the disk).
    if src_size != dst_size:
      return False
    if self.compute_checksums:
      (src_crc32c, src_md5, dst_crc32c, dst_md5) = _ComputeNeededFileChecksums(
          self.logger, src_url_str, src_size, src_crc32c, src_md5, dst_url_str,
          dst_size, dst_crc32c, dst_md5)
    if src_md5 != _NA and dst_md5 != _NA:
      self.logger.debug('Comparing md5 for %s and %s', src_url_str, dst_url_str)
      return src_md5 == dst_md5
    if src_crc32c != _NA and dst_crc32c != _NA:
      self.logger.debug(
          'Comparing crc32c for %s and %s', src_url_str, dst_url_str)
      return src_crc32c == dst_crc32c
    if not self._WarnIfMissingCloudHash(src_url_str, src_crc32c, src_md5):
      self._WarnIfMissingCloudHash(dst_url_str, dst_crc32c, dst_md5)
    # Without checksums to compare we depend only on basic size comparison.
    return True

  def __iter__(self):
    """Iterates over src/dst URLs and produces a _DiffToApply sequence.

    Yields:
      The _DiffToApply.
    """
    # Strip trailing slashes, if any, so we compute tail length against
    # consistent position regardless of whether trailing slashes were included
    # or not in URL.
    base_src_url_len = len(self.base_src_url.GetUrlString().rstrip('/\\'))
    base_dst_url_len = len(self.base_dst_url.GetUrlString().rstrip('/\\'))
    src_url_str = dst_url_str = None
    # Invariant: After each yield, the URLs in src_url_str, dst_url_str,
    # self.sorted_src_urls_it, and self.sorted_dst_urls_it are not yet
    # processed. Each time we encounter None in src_url_str or dst_url_str we
    # populate from the respective iterator, and we reset one or the other value
    # to None after yielding an action that disposes of that URL.
    while not self.sorted_src_urls_it.IsEmpty() or src_url_str is not None:
      if src_url_str is None:
        (src_url_str, src_size, src_crc32c, src_md5) = self._ParseTmpFileLine(
            self.sorted_src_urls_it.next())
        # Skip past base URL and normalize slashes so we can compare across
        # clouds/file systems (including Windows).
        src_url_str_to_check = src_url_str[base_src_url_len:].replace('\\', '/')
        dst_url_str_would_copy_to = copy_helper.ConstructDstUrl(
            self.base_src_url, StorageUrlFromString(src_url_str), True, True,
            True, self.base_dst_url, False,
            self.recursion_requested).GetUrlString()
      if self.sorted_dst_urls_it.IsEmpty():
        # We've reached end of dst URLs, so copy src to dst.
        yield _DiffToApply(
            src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY)
        src_url_str = None
        continue
      if not dst_url_str:
        (dst_url_str, dst_size, dst_crc32c, dst_md5) = (
            self._ParseTmpFileLine(self.sorted_dst_urls_it.next()))
        # Skip past base URL and normalize slashes so we can compare across
        # clouds/file systems (including Windows).
        dst_url_str_to_check = dst_url_str[base_dst_url_len:].replace('\\', '/')

      if src_url_str_to_check < dst_url_str_to_check:
        # There's no dst object corresponding to src object, so copy src to dst.
        yield _DiffToApply(
            src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY)
        src_url_str = None
      elif src_url_str_to_check > dst_url_str_to_check:
        # dst object without a corresponding src object, so remove dst if -d
        # option was specified.
        if self.delete_extras:
          yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
        dst_url_str = None
      else:
        # There is a dst object corresponding to src object, so check if objects
        # match.
        if self._ObjectsMatch(
            src_url_str, src_size, src_crc32c, src_md5,
            dst_url_str, dst_size, dst_crc32c, dst_md5):
          # Continue iterating without yielding a _DiffToApply.
          src_url_str = None
          dst_url_str = None
        else:
          yield _DiffToApply(src_url_str, dst_url_str, _DiffAction.COPY)
          dst_url_str = None

    # If -d option specified any files/objects left in dst iteration should be
    # removed.
    if not self.delete_extras:
      return
    if dst_url_str:
      yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
      dst_url_str = None
    for line in self.sorted_dst_urls_it:
      (dst_url_str, _, _, _) = self._ParseTmpFileLine(line)
      yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
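The __iter__ above is a merge join over two sorted listings. Its core control flow, distilled (an illustrative sketch: inputs are sorted lists of (key, metadata) pairs, and 'COPY'/'REMOVE' stand in for the _DiffAction values):

def merge_diff(src, dst, delete_extras=False):
    si, di = 0, 0
    while si < len(src):
        if di >= len(dst) or src[si][0] < dst[di][0]:
            yield ('COPY', src[si][0])        # src entry with no dst counterpart
            si += 1
        elif src[si][0] > dst[di][0]:
            if delete_extras:
                yield ('REMOVE', dst[di][0])  # dst-only entry
            di += 1
        else:
            if src[si][1] != dst[di][1]:      # same key, differing content
                yield ('COPY', src[si][0])
            si += 1
            di += 1
    if delete_extras:                         # leftover dst-only entries
        while di < len(dst):
            yield ('REMOVE', dst[di][0])
            di += 1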
Example #27
def NameExpansionIterator(command_name,
                          proj_id_handler,
                          headers,
                          debug,
                          bucket_storage_uri_class,
                          uri_strs,
                          recursion_requested,
                          have_existing_dst_container=None,
                          flat=True,
                          all_versions=False,
                          for_all_version_delete=False):
    """
  Static factory function for instantiating _NameExpansionIterator, which
  wraps the resulting iterator in a PluralityCheckableIterator and checks
   that it is non-empty. Also, allows uri_strs to be either an array or an
  iterator.

  Args:
    command_name: name of command being run.
    proj_id_handler: ProjectIdHandler to use for current command.
    headers: Dictionary containing optional HTTP headers to pass to boto.
    debug: Debug level to pass in to boto connection (range 0..3).
    bucket_storage_uri_class: Class to instantiate for cloud StorageUris.
        Settable for testing/mocking.
    uri_strs: PluralityCheckableIterator of URI strings needing expansion.
    recursion_requested: True if -R specified on command-line.
    have_existing_dst_container: Bool indicator whether this is a copy
        request to an existing bucket, bucket subdir, or directory. Default
        None value should be used in cases where this is not needed (commands
        other than cp).
    flat: Bool indicating whether bucket listings should be flattened, i.e.,
        so the mapped-to results contain objects spanning subdirectories.
    all_versions: Bool indicating whether to iterate over all object versions.
    for_all_version_delete: Bool indicating whether this is for an all-version
        delete.

  Examples of ExpandWildcardsAndContainers with flat=True:
    - Calling with one of the uri_strs being 'gs://bucket' will enumerate all
      top-level objects, as will 'gs://bucket/' and 'gs://bucket/*'.
    - 'gs://bucket/**' will enumerate all objects in the bucket.
    - 'gs://bucket/abc' will enumerate all next-level objects under directory
      abc (i.e., not including subdirectories of abc) if gs://bucket/abc/*
      matches any objects; otherwise it will enumerate the single name
      gs://bucket/abc
    - 'gs://bucket/abc/**' will enumerate all objects under abc or any of its
      subdirectories.
    - 'file:///tmp' will enumerate all files under /tmp, as will
      'file:///tmp/*'
    - 'file:///tmp/**' will enumerate all files under /tmp or any of its
      subdirectories.

  Example if flat=False: calling with gs://bucket/abc/* lists matching objects
  or subdirs, but not sub-subdirs or objects beneath subdirs.

  Note: In step-by-step comments below we give examples assuming there's a
  gs://bucket with object paths:
    abcd/o1.txt
    abcd/o2.txt
    xyz/o1.txt
    xyz/o2.txt
  and a directory file://dir with file paths:
    dir/a.txt
    dir/b.txt
    dir/c/
  """
    uri_strs = PluralityCheckableIterator(uri_strs)
    name_expansion_iterator = _NameExpansionIterator(
        command_name,
        proj_id_handler,
        headers,
        debug,
        bucket_storage_uri_class,
        uri_strs,
        recursion_requested,
        have_existing_dst_container,
        flat,
        all_versions=all_versions,
        for_all_version_delete=for_all_version_delete)
    name_expansion_iterator = PluralityCheckableIterator(
        name_expansion_iterator)
    if name_expansion_iterator.is_empty():
        raise CommandException('No URIs matched')
    return name_expansion_iterator
Example #28
    def _RecursePrint(self, blr):
        """
    Expands a bucket listing reference and recurses to its children, calling
    _PrintInfoAboutBucketListingRef for each expanded object found.

    Args:
      blr: An instance of BucketListingRef.

    Returns:
      Tuple containing (number of objects, total number of bytes)
    """
        num_bytes = 0
        num_objs = 0

        if blr.HasKey():
            blr_iterator = iter([blr])
        elif blr.HasPrefix():
            blr_iterator = self.WildcardIterator(
                '%s/*' % blr.GetRStrippedUriString(),
                all_versions=self.all_versions)
        elif blr.NamesBucket():
            blr_iterator = self.WildcardIterator(
                '%s*' % blr.GetUriString(), all_versions=self.all_versions)
        else:
            # This BLR didn't come from a bucket listing. This case happens for
            # BLR's instantiated from a user-provided URI.
            blr_iterator = PluralityCheckableIterator(
                UriOnlyBlrExpansionIterator(self,
                                            blr,
                                            all_versions=self.all_versions))
            if blr_iterator.is_empty() and not ContainsWildcard(
                    blr.GetUriString()):
                raise CommandException('No such object %s' %
                                       blr.GetUriString())

        for cur_blr in blr_iterator:
            if self.exclude_patterns:
                tomatch = cur_blr.GetUriString()
                skip = False
                for pattern in self.exclude_patterns:
                    if fnmatch.fnmatch(tomatch, pattern):
                        skip = True
                        break
                if skip:
                    continue
            if cur_blr.HasKey():
                # Object listing.
                no, nb = self._PrintInfoAboutBucketListingRef(cur_blr)
            else:
                # Subdir listing.
                if cur_blr.GetUriString().endswith('//'):
                    # Expand gs://bucket// into gs://bucket//* so we don't infinite
                    # loop. This case happens when user has uploaded an object whose
                    # name begins with a /.
                    cur_blr = BucketListingRef(
                        self.suri_builder.StorageUri(
                            '%s*' % cur_blr.GetUriString()), None, None,
                        cur_blr.headers)
                no, nb = self._RecursePrint(cur_blr)
            num_bytes += nb
            num_objs += no

        if blr.HasPrefix() and not self.summary_only:
            self._PrintSummaryLine(num_bytes,
                                   blr.GetUriString().encode('utf-8'))

        return num_objs, num_bytes
Example #29
    def _ExpandUriAndPrintInfo(self, uri, listing_style, should_recurse=False):
        """
    Expands wildcards and directories/buckets for uri as needed, and
    calls _PrintInfoAboutBucketListingRef() on each.

    Args:
      uri: StorageUri being listed.
      listing_style: ListingStyle enum describing type of output desired.
      should_recurse: bool indicator of whether to expand recursively.

    Returns:
      Tuple (number of matching objects, number of bytes across these objects).
    """
        # We do a two-level loop, with the outer loop iterating level-by-level from
        # blrs_to_expand, and the inner loop iterating the matches at the current
        # level, printing them, and adding any new subdirs that need expanding to
        # blrs_to_expand (to be picked up in the next outer loop iteration).
        blrs_to_expand = [BucketListingRef(uri)]
        num_objs = 0
        num_bytes = 0
        expanding_top_level = True
        printed_one = False
        num_expanded_blrs = 0
        while len(blrs_to_expand):
            if printed_one:
                print
            blr = blrs_to_expand.pop(0)
            if blr.HasKey():
                blr_iterator = iter([blr])
            elif blr.HasPrefix():
                # Bucket subdir from a previous iteration. Print "header" line only if
                # we're listing more than one subdir (or if it's a recursive listing),
                # to be consistent with the way UNIX ls works.
                if num_expanded_blrs > 1 or should_recurse:
                    print '%s:' % blr.GetUriString().encode('utf-8')
                    printed_one = True
                blr_iterator = self.WildcardIterator(
                    '%s/*' % blr.GetRStrippedUriString())
            elif blr.NamesBucket():
                blr_iterator = self.WildcardIterator('%s*' %
                                                     blr.GetUriString())
            else:
                # This BLR didn't come from a bucket listing. This case happens for
                # BLR's instantiated from a user-provided URI.
                blr_iterator = PluralityCheckableIterator(
                    _UriOnlyBlrExpansionIterator(self, blr))
                if blr_iterator.is_empty() and not ContainsWildcard(uri):
                    raise CommandException('No such object %s' % uri)
            for cur_blr in blr_iterator:
                num_expanded_blrs = num_expanded_blrs + 1
                if cur_blr.HasKey():
                    # Object listing.
                    (no, nb) = self._PrintInfoAboutBucketListingRef(
                        cur_blr, listing_style)
                    num_objs += no
                    num_bytes += nb
                    printed_one = True
                else:
                    # Subdir listing. If we're at the top level of a bucket subdir
                    # listing don't print the list here (corresponding to how UNIX ls
                    # dir just prints its contents, not the name followed by its
                    # contents).
                    if (expanding_top_level
                            and not uri.names_bucket()) or should_recurse:
                        if cur_blr.GetUriString().endswith('//'):
                            # Expand gs://bucket// into gs://bucket//* so we don't infinite
                            # loop. This case happens when user has uploaded an object whose
                            # name begins with a /.
                            cur_blr = BucketListingRef(
                                self.suri_builder.StorageUri(
                                    '%s*' % cur_blr.GetUriString()), None,
                                None, cur_blr.headers)
                        blrs_to_expand.append(cur_blr)
                    # Don't include the subdir name in the output if we're doing a
                    # recursive listing, as it will be printed as 'subdir:' when we get
                    # to the prefix expansion, the next iteration of the main loop.
                    else:
                        if listing_style == ListingStyle.LONG:
                            print '%-33s%s' % (
                                '', cur_blr.GetUriString().encode('utf-8'))
                        else:
                            print cur_blr.GetUriString().encode('utf-8')
            expanding_top_level = False
        return (num_objs, num_bytes)
Example #30
    def ExpandUrlAndPrint(self, url):
        """Iterates over the given URL and calls print functions.

    Args:
      url: StorageUrl to iterate over.

    Returns:
      (num_objects, num_bytes) total number of objects and bytes iterated.
    """
        num_objects = 0
        num_dirs = 0
        num_bytes = 0
        print_newline = False

        if url.IsBucket() or self.should_recurse:
            # IsBucket() implies a top-level listing.
            if url.IsBucket():
                self._print_bucket_header_func(url)
            return self._RecurseExpandUrlAndPrint(url.url_string,
                                                  print_initial_newline=False)
        else:
            # User provided a prefix or object URL, but it's impossible to tell
            # which until we do a listing and see what matches.
            top_level_iterator = PluralityCheckableIterator(
                self._iterator_func(
                    url.CreatePrefixUrl(wildcard_suffix=None),
                    all_versions=self.all_versions).IterAll(
                        expand_top_level_buckets=True,
                        bucket_listing_fields=self.bucket_listing_fields))
            plurality = top_level_iterator.HasPlurality()

            try:
                top_level_iterator.PeekException()
            except EncryptionException:
                # Detailed listing on a single object can perform a GetObjectMetadata
                # call, which raises if a matching encryption key isn't found.
                # Re-iterate without requesting encrypted fields.
                top_level_iterator = PluralityCheckableIterator(
                    self._iterator_func(
                        url.CreatePrefixUrl(wildcard_suffix=None),
                        all_versions=self.all_versions).
                    IterAll(
                        expand_top_level_buckets=True,
                        bucket_listing_fields=UNENCRYPTED_FULL_LISTING_FIELDS))
                plurality = top_level_iterator.HasPlurality()

            for blr in top_level_iterator:
                if self._MatchesExcludedPattern(blr):
                    continue
                if blr.IsObject():
                    nd = 0
                    no, nb = self._print_object_func(blr)
                    print_newline = True
                elif blr.IsPrefix():
                    if print_newline:
                        self._print_newline_func()
                    else:
                        print_newline = True
                    if plurality and self.list_subdir_contents:
                        self._print_dir_header_func(blr)
                    elif plurality and not self.list_subdir_contents:
                        print_newline = False
                    expansion_url_str = StorageUrlFromString(
                        blr.url_string).CreatePrefixUrl(
                            wildcard_suffix='*' if self.
                            list_subdir_contents else None)
                    nd, no, nb = self._RecurseExpandUrlAndPrint(
                        expansion_url_str)
                    self._print_dir_summary_func(nb, blr)
                else:
                    # We handle all buckets at the top level, so this should never happen.
                    raise CommandException(
                        'Sub-level iterator returned a CsBucketListingRef of type Bucket'
                    )
                num_objects += no
                num_dirs += nd
                num_bytes += nb
            return num_dirs, num_objects, num_bytes
Example #31
class _DiffIterator(object):
    """Iterator yielding sequence of _DiffToApply objects."""

    def __init__(self, command_obj, base_src_url, base_dst_url):
        self.command_obj = command_obj
        self.compute_file_checksums = command_obj.compute_file_checksums
        self.delete_extras = command_obj.delete_extras
        self.recursion_requested = command_obj.recursion_requested
        self.logger = self.command_obj.logger
        self.base_src_url = base_src_url
        self.base_dst_url = base_dst_url
        self.logger.info("Building synchronization state...")

        (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp(prefix="gsutil-rsync-src-")
        _tmp_files.append(self.sorted_list_src_file_name)
        (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp(prefix="gsutil-rsync-dst-")
        _tmp_files.append(self.sorted_list_dst_file_name)
        # Close the file handles; the files will be opened in write mode by
        # _ListUrlRootFunc.
        os.close(src_fh)
        os.close(dst_fh)

        # Build sorted lists of src and dst URLs in parallel. To do this, pass args
        # to _ListUrlRootFunc as tuple (base_url_str, out_filename, desc)
        # where base_url_str is the starting URL string for listing.
        args_iter = iter(
            [
                (self.base_src_url.url_string, self.sorted_list_src_file_name, "source"),
                (self.base_dst_url.url_string, self.sorted_list_dst_file_name, "destination"),
            ]
        )

        # Counter (shared across workers) of non-retryable listing failures.
        command_obj.non_retryable_listing_failures = 0
        shared_attrs = ["non_retryable_listing_failures"]
        command_obj.Apply(
            _ListUrlRootFunc,
            args_iter,
            _RootListingExceptionHandler,
            shared_attrs,
            arg_checker=DummyArgChecker,
            parallel_operations_override=True,
            fail_on_error=True,
        )

        if command_obj.non_retryable_listing_failures:
            raise CommandException("Caught non-retryable exception - aborting rsync")

        self.sorted_list_src_file = open(self.sorted_list_src_file_name, "r")
        self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, "r")

        # Wrap iterators in PluralityCheckableIterator so we can check emptiness.
        self.sorted_src_urls_it = PluralityCheckableIterator(iter(self.sorted_list_src_file))
        self.sorted_dst_urls_it = PluralityCheckableIterator(iter(self.sorted_list_dst_file))

    def _ParseTmpFileLine(self, line):
        """Parses output from _BuildTmpOutputLine.

    Parses into tuple:
      (URL, size, crc32c, md5)
    where crc32c and/or md5 can be _NA.

    Args:
      line: The line to parse.

    Returns:
      Parsed tuple: (url, size, crc32c, md5)
    """
        (encoded_url, size, crc32c, md5) = line.split()
        return (_DecodeUrl(encoded_url), int(size), crc32c, md5.strip())

    def _WarnIfMissingCloudHash(self, url_str, crc32c, md5):
        """Warns if given url_str is a cloud URL and is missing both crc32c and md5.

    Args:
      url_str: Destination URL string.
      crc32c: Destination CRC32c.
      md5: Destination MD5.

    Returns:
      True if issued warning.
    """
        # One known way this can currently happen is when rsync'ing objects larger
        # than 5 GB from S3 (for which the etag is not an MD5).
        if StorageUrlFromString(url_str).IsCloudUrl() and crc32c == _NA and md5 == _NA:
            self.logger.warn(
                "Found no hashes to validate %s. Integrity cannot be assured "
                "without hashes.", url_str)
            return True
        return False

    def _ObjectsMatch(self, src_url_str, src_size, src_crc32c, src_md5, dst_url_str, dst_size, dst_crc32c, dst_md5):
        """Returns True if src and dst objects are the same.

    Uses size plus whatever checksums are available.

    Args:
      src_url_str: Source URL string.
      src_size: Source size
      src_crc32c: Source CRC32c.
      src_md5: Source MD5.
      dst_url_str: Destination URL string.
      dst_size: Destination size
      dst_crc32c: Destination CRC32c.
      dst_md5: Destination MD5.

    Returns:
      True/False.
    """
        # Note: This function is called from __iter__, which is called from the
        # Command.Apply driver. Thus, all checksum computation will be run in a
        # single thread, which is good (having multiple threads concurrently
        # computing checksums would thrash the disk).
        if src_size != dst_size:
            return False
        if self.compute_file_checksums:
            (src_crc32c, src_md5, dst_crc32c, dst_md5) = _ComputeNeededFileChecksums(
                self.logger, src_url_str, src_size, src_crc32c, src_md5, dst_url_str, dst_size, dst_crc32c, dst_md5
            )
        if src_md5 != _NA and dst_md5 != _NA:
            self.logger.debug("Comparing md5 for %s and %s", src_url_str, dst_url_str)
            return src_md5 == dst_md5
        if src_crc32c != _NA and dst_crc32c != _NA:
            self.logger.debug("Comparing crc32c for %s and %s", src_url_str, dst_url_str)
            return src_crc32c == dst_crc32c
        if not self._WarnIfMissingCloudHash(src_url_str, src_crc32c, src_md5):
            self._WarnIfMissingCloudHash(dst_url_str, dst_crc32c, dst_md5)
        # Without checksums to compare we depend only on basic size comparison.
        return True

    def __iter__(self):
        """Iterates over src/dst URLs and produces a _DiffToApply sequence.

    Yields:
      The _DiffToApply.
    """
        # Strip trailing slashes, if any, so we compute tail length against
        # consistent position regardless of whether trailing slashes were included
        # or not in URL.
        base_src_url_len = len(self.base_src_url.url_string.rstrip("/\\"))
        base_dst_url_len = len(self.base_dst_url.url_string.rstrip("/\\"))
        src_url_str = dst_url_str = None
        # Invariant: After each yield, the URLs in src_url_str, dst_url_str,
        # self.sorted_src_urls_it, and self.sorted_dst_urls_it are not yet
        # processed. Each time we encounter None in src_url_str or dst_url_str we
        # populate from the respective iterator, and we reset one or the other value
        # to None after yielding an action that disposes of that URL.
        while not self.sorted_src_urls_it.IsEmpty() or src_url_str is not None:
            if src_url_str is None:
                (src_url_str, src_size, src_crc32c, src_md5) = self._ParseTmpFileLine(self.sorted_src_urls_it.next())
                # Skip past base URL and normalize slashes so we can compare across
                # clouds/file systems (including Windows).
                src_url_str_to_check = _EncodeUrl(src_url_str[base_src_url_len:].replace("\\", "/"))
                dst_url_str_would_copy_to = copy_helper.ConstructDstUrl(
                    self.base_src_url,
                    StorageUrlFromString(src_url_str),
                    True,
                    True,
                    self.base_dst_url,
                    False,
                    self.recursion_requested,
                ).url_string
            if self.sorted_dst_urls_it.IsEmpty():
                # We've reached end of dst URLs, so copy src to dst.
                yield _DiffToApply(src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY)
                src_url_str = None
                continue
            if not dst_url_str:
                (dst_url_str, dst_size, dst_crc32c, dst_md5) = self._ParseTmpFileLine(self.sorted_dst_urls_it.next())
                # Skip past base URL and normalize slashes so we can compare
                # across clouds/file systems (including Windows).
                dst_url_str_to_check = _EncodeUrl(dst_url_str[base_dst_url_len:].replace("\\", "/"))

            if src_url_str_to_check < dst_url_str_to_check:
                # There's no dst object corresponding to src object, so copy src to dst.
                yield _DiffToApply(src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY)
                src_url_str = None
            elif src_url_str_to_check > dst_url_str_to_check:
                # dst object without a corresponding src object, so remove dst if -d
                # option was specified.
                if self.delete_extras:
                    yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
                dst_url_str = None
            else:
                # There is a dst object corresponding to src object, so check if objects
                # match.
                if self._ObjectsMatch(
                    src_url_str, src_size, src_crc32c, src_md5, dst_url_str, dst_size, dst_crc32c, dst_md5
                ):
                    # Continue iterating without yielding a _DiffToApply.
                    pass
                else:
                    yield _DiffToApply(src_url_str, dst_url_str, _DiffAction.COPY)
                src_url_str = None
                dst_url_str = None

        # If -d option specified any files/objects left in dst iteration should be
        # removed.
        if not self.delete_extras:
            return
        if dst_url_str:
            yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
            dst_url_str = None
        for line in self.sorted_dst_urls_it:
            (dst_url_str, _, _, _) = self._ParseTmpFileLine(line)
            yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
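
The __iter__ above is a two-pointer merge over pre-sorted listings. A standalone sketch of the same diff walk (toy data; string actions stand in for _DiffAction, and the checksum comparison is elided):

def diff_sorted(src, dst, delete_extras=True):
    """Yields (action, name) pairs for two sorted name lists."""
    si = di = 0
    while si < len(src):
        if di >= len(dst) or src[si] < dst[di]:
            yield ('COPY', src[si])        # no dst counterpart: copy src
            si += 1
        elif src[si] > dst[di]:
            if delete_extras:
                yield ('REMOVE', dst[di])  # dst-only entry: remove if -d
            di += 1
        else:
            si += 1                        # match: real code compares checksums
            di += 1
    if delete_extras:
        for name in dst[di:]:
            yield ('REMOVE', name)         # trailing dst-only entries

# list(diff_sorted(['a', 'c'], ['b', 'c'])) == [('COPY', 'a'), ('REMOVE', 'b')]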
Example #32
0
  def _ExpandUriAndPrintInfo(self, uri, listing_style, should_recurse=False):
    """
    Expands wildcards and directories/buckets for uri as needed, and
    calls _PrintInfoAboutBucketListingRef() on each.

    Args:
      uri: StorageUri being listed.
      listing_style: ListingStyle enum describing type of output desired.
      should_recurse: bool indicator of whether to expand recursively.

    Returns:
      Tuple (number of matching objects, number of bytes across these objects).
    """
    # We do a two-level loop, with the outer loop iterating level-by-level from
    # blrs_to_expand, and the inner loop iterating the matches at the current
    # level, printing them, and adding any new subdirs that need expanding to
    # blrs_to_expand (to be picked up in the next outer loop iteration).
    blrs_to_expand = [BucketListingRef(uri)]
    num_objs = 0
    num_bytes = 0
    expanding_top_level = True
    printed_one = False
    num_expanded_blrs = 0
    while len(blrs_to_expand):
      if printed_one:
        print
      blr = blrs_to_expand.pop(0)
      if blr.HasKey():
        blr_iterator = iter([blr])
      elif blr.HasPrefix():
        # Bucket subdir from a previous iteration. Print "header" line only if
        # we're listing more than one subdir (or if it's a recursive listing),
        # to be consistent with the way UNIX ls works.
        if num_expanded_blrs > 1 or should_recurse:
          print '%s:' % blr.GetUriString().encode('utf-8')
          printed_one = True
        blr_iterator = self.WildcardIterator('%s/*' %
                                             blr.GetRStrippedUriString(),
                                             all_versions=self.all_versions)
      elif blr.NamesBucket():
        blr_iterator = self.WildcardIterator('%s*' % blr.GetUriString(),
                                             all_versions=self.all_versions)
      else:
        # This BLR didn't come from a bucket listing. This case happens for
        # BLR's instantiated from a user-provided URI.
        blr_iterator = PluralityCheckableIterator(
            _UriOnlyBlrExpansionIterator(
                self, blr, all_versions=self.all_versions))
        if blr_iterator.is_empty() and not ContainsWildcard(uri):
          raise CommandException('No such object %s' % uri)
      for cur_blr in blr_iterator:
        num_expanded_blrs = num_expanded_blrs + 1
        if cur_blr.HasKey():
          # Object listing.
          (no, nb) = self._PrintInfoAboutBucketListingRef(
              cur_blr, listing_style)
          num_objs += no
          num_bytes += nb
          printed_one = True
        else:
          # Subdir listing. If we're at the top level of a bucket subdir
          # listing don't print the list here (corresponding to how UNIX ls
          # dir just prints its contents, not the name followed by its
          # contents).
          if (expanding_top_level and not uri.names_bucket()) or should_recurse:
            if cur_blr.GetUriString().endswith('//'):
              # Expand gs://bucket// into gs://bucket//* so we don't infinite
              # loop. This case happens when user has uploaded an object whose
              # name begins with a /.
              cur_blr = BucketListingRef(self.suri_builder.StorageUri(
                  '%s*' % cur_blr.GetUriString()), None, None, cur_blr.headers)
            blrs_to_expand.append(cur_blr)
          # Don't include the subdir name in the output if we're doing a
          # recursive listing, as it will be printed as 'subdir:' when we get
          # to the prefix expansion, the next iteration of the main loop.
          else:
            if listing_style == ListingStyle.LONG:
              print '%-33s%s' % (
                  '', cur_blr.GetUriString().encode('utf-8'))
            else:
              print cur_blr.GetUriString().encode('utf-8')
      expanding_top_level = False
    return (num_objs, num_bytes)
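
The outer while loop above is a breadth-first work-queue expansion. A minimal sketch of the same pattern against a local directory tree, with os.listdir standing in for the wildcard iterator:

import os

def recurse_print_sketch(root):
    # Pop one container per outer iteration; print files and queue subdirs
    # for a later level, mirroring blrs_to_expand above.
    to_expand = [root]
    while to_expand:
        path = to_expand.pop(0)
        for name in sorted(os.listdir(path)):
            full = os.path.join(path, name)
            if os.path.isdir(full):
                to_expand.append(full)
            else:
                print(full)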
Example #33
0
    def __iter__(self):
        """Iterates over all source URLs passed to the iterator.

    For each src url, expands wildcards, object-less bucket names,
    subdir bucket names, and directory names, and generates a flat listing of
    all the matching objects/files.

    You should instantiate this object using the static factory function
    NameExpansionIterator, because consumers of this iterator need the
    PluralityCheckableIterator wrapper built by that function.

    Yields:
      gslib.name_expansion.NameExpansionResult.

    Raises:
      CommandException: if errors encountered.
    """
        for url_str in self.url_strs:
            storage_url = StorageUrlFromString(url_str)

            if storage_url.IsFileUrl() and storage_url.IsStream():
                if self.url_strs.has_plurality:
                    raise CommandException(
                        'Multiple URL strings are not supported '
                        'with streaming ("-") URLs.')
                yield NameExpansionResult(storage_url, False, False,
                                          storage_url)
                continue

            # Step 1: Expand any explicitly specified wildcards. The output from this
            # step is an iterator of BucketListingRef.
            # Starting with gs://buck*/abc* this step would expand to gs://bucket/abcd

            src_names_bucket = False
            if (storage_url.IsCloudUrl() and storage_url.IsBucket()
                    and not self.recursion_requested):
                # UNIX commands like rm and cp will omit directory references.
                # If url_str refers only to buckets and we are not recursing,
                # then produce references of type BUCKET, because they are guaranteed
                # to pass through Step 2 and be omitted in Step 3.
                post_step1_iter = PluralityCheckableIterator(
                    self.WildcardIterator(url_str).IterBuckets(
                        bucket_fields=['id']))
            else:
                # Get a list of objects and prefixes, expanding the top level for
                # any listed buckets.  If our source is a bucket, however, we need
                # to treat all of the top level expansions as names_container=True.
                post_step1_iter = PluralityCheckableIterator(
                    self.WildcardIterator(url_str).IterAll(
                        bucket_listing_fields=['name'],
                        expand_top_level_buckets=True))
                if storage_url.IsCloudUrl() and storage_url.IsBucket():
                    src_names_bucket = True

            # Step 2: Expand bucket subdirs. The output from this
            # step is an iterator of (names_container, BucketListingRef).
            # Starting with gs://bucket/abcd this step would expand to:
            #   iter([(True, abcd/o1.txt), (True, abcd/o2.txt)]).
            subdir_exp_wildcard = self._flatness_wildcard[
                self.recursion_requested]
            if self.recursion_requested:
                post_step2_iter = _ImplicitBucketSubdirIterator(
                    self, post_step1_iter, subdir_exp_wildcard)
            else:
                post_step2_iter = _NonContainerTuplifyIterator(post_step1_iter)
            post_step2_iter = PluralityCheckableIterator(post_step2_iter)

            # Because we actually perform and check object listings here, this will
            # raise if url_args includes a non-existent object.  However,
            # plurality_checkable_iterator will buffer the exception for us, not
            # raising it until the iterator is actually asked to yield the first
            # result.
            if post_step2_iter.IsEmpty():
                if self.continue_on_error:
                    try:
                        raise CommandException('No URLs matched: %s' % url_str)
                    except CommandException, e:
                        # Yield a specialized tuple of (exception, stack_trace) to
                        # the wrapping PluralityCheckableIterator.
                        yield (e, sys.exc_info()[2])
                else:
                    raise CommandException('No URLs matched: %s' % url_str)

            # Step 3. Omit any directories, buckets, or bucket subdirectories for
            # non-recursive expansions.
            post_step3_iter = PluralityCheckableIterator(
                _OmitNonRecursiveIterator(post_step2_iter,
                                          self.recursion_requested,
                                          self.command_name,
                                          self.cmd_supports_recursion,
                                          self.logger))

            src_url_expands_to_multi = post_step3_iter.HasPlurality()
            is_multi_source_request = (self.url_strs.has_plurality
                                       or src_url_expands_to_multi)

            # Step 4. Expand directories and buckets. This step yields the iterated
            # values. Starting with gs://bucket this step would expand to:
            #  [abcd/o1.txt, abcd/o2.txt, xyz/o1.txt, xyz/o2.txt]
            # Starting with file://dir this step would expand to:
            #  [dir/a.txt, dir/b.txt, dir/c/]
            for (names_container, blr) in post_step3_iter:
                src_names_container = src_names_bucket or names_container

                if blr.IsObject():
                    yield NameExpansionResult(storage_url,
                                              is_multi_source_request,
                                              src_names_container,
                                              blr.storage_url)
                else:
                    # Use implicit wildcarding to do the enumeration.
                    # At this point we are guaranteed that:
                    # - Recursion has been requested because non-object entries are
                    #   filtered in step 3 otherwise.
                    # - This is a prefix or bucket subdirectory because only
                    #   non-recursive iterations produce bucket references.
                    expanded_url = StorageUrlFromString(blr.url_string)
                    if expanded_url.IsFileUrl():
                        # Convert dir to implicit recursive wildcard.
                        url_to_iterate = '%s%s%s' % (blr, os.sep,
                                                     subdir_exp_wildcard)
                    else:
                        # Convert subdir to implicit recursive wildcard.
                        url_to_iterate = expanded_url.CreatePrefixUrl(
                            wildcard_suffix=subdir_exp_wildcard)

                    wc_iter = PluralityCheckableIterator(
                        self.WildcardIterator(url_to_iterate).IterObjects(
                            bucket_listing_fields=['name']))
                    src_url_expands_to_multi = (src_url_expands_to_multi
                                                or wc_iter.HasPlurality())
                    is_multi_source_request = (self.url_strs.has_plurality
                                               or src_url_expands_to_multi)
                    # This will be a flattened listing of all underlying objects in the
                    # subdir.
                    for blr in wc_iter:
                        yield NameExpansionResult(storage_url,
                                                  is_multi_source_request,
                                                  True, blr.storage_url)
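
Each numbered step above wraps the previous step's iterator, so expansion stays lazy end to end. A toy sketch of that staged pipeline (the step functions are hypothetical stand-ins, not gsutil APIs; PluralityCheckableIterator and CommandException are assumed importable as in the example):

def step1_expand_wildcards(url_strs):
    for url_str in url_strs:
        yield url_str.rstrip('*')        # stand-in for wildcard expansion

def step2_tag_containers(refs):
    for ref in refs:
        yield (ref.endswith('/'), ref)   # (names_container, ref) tuples

pipeline = PluralityCheckableIterator(
    step2_tag_containers(step1_expand_wildcards(
        iter(['gs://bucket/abc*', 'gs://bucket/xyz/']))))
if pipeline.IsEmpty():                   # look-ahead check before consuming
    raise CommandException('No URLs matched')
for names_container, ref in pipeline:
    print('%s %s' % (names_container, ref))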
Example #34
0
  def __iter__(self):
    for uri_str in self.uri_strs:
      # Step 1: Expand any explicitly specified wildcards. The output from this
      # step is an iterator of BucketListingRef.
      # Starting with gs://buck*/abc* this step would expand to gs://bucket/abcd
      if ContainsWildcard(uri_str):
        post_step1_iter = self._WildcardIterator(uri_str)
      else:
        suri = self.suri_builder.StorageUri(uri_str)
        post_step1_iter = iter([BucketListingRef(suri)])
      post_step1_iter = PluralityCheckableIterator(post_step1_iter)

      # Step 2: Expand bucket subdirs and versions. The output from this
      # step is an iterator of (names_container, BucketListingRef).
      # Starting with gs://bucket/abcd this step would expand to:
      #   iter([(True, abcd/o1.txt), (True, abcd/o2.txt)]).
      if self.flat and self.recursion_requested:
        post_step2_iter = _ImplicitBucketSubdirIterator(self,
            post_step1_iter, self.flat)
      elif self.all_versions:
        post_step2_iter = _AllVersionIterator(self, post_step1_iter,
                                              headers=self.headers)
      else:
        post_step2_iter = _NonContainerTuplifyIterator(post_step1_iter)
      post_step2_iter = PluralityCheckableIterator(post_step2_iter)

      # Step 3. Expand directories and buckets. This step yields the iterated
      # values. Starting with gs://bucket this step would expand to:
      #  [abcd/o1.txt, abcd/o2.txt, xyz/o1.txt, xyz/o2.txt]
      # Starting with file://dir this step would expand to:
      #  [dir/a.txt, dir/b.txt, dir/c/]
      exp_src_bucket_listing_refs = []
      wc = self._flatness_wildcard[self.flat]
      src_uri_expands_to_multi = (post_step1_iter.has_plurality()
                                  or post_step2_iter.has_plurality())
      is_multi_src_request = (self.uri_strs.has_plurality()
                              or src_uri_expands_to_multi)

      if post_step2_iter.is_empty():
        raise CommandException('No URIs matched: %s' % uri_str)
      for (names_container, blr) in post_step2_iter:
        if (not blr.GetUri().names_container()
            and (self.flat or not blr.HasPrefix())):
          yield NameExpansionResult(uri_str, is_multi_src_request,
                                    src_uri_expands_to_multi, names_container,
                                    blr.GetUriString(),
                                    self.have_existing_dst_container,
                                    is_latest=blr.IsLatest())
          continue
        if not self.recursion_requested:
          if blr.GetUri().is_file_uri():
            desc = 'directory'
          elif blr.GetUri().names_bucket():
            desc = 'bucket'
          else:
            desc = 'bucket subdir'
          if self.cmd_supports_recursion:
            self.logger.info(
                'Omitting %s "%s". (Did you mean to do %s -R?)',
                desc, blr.GetUri(), self.command_name)
          else:
            self.logger.info('Omitting %s "%s".', desc, blr.GetUri())
          continue
        if blr.GetUri().is_file_uri():
          # Convert dir to implicit recursive wildcard.
          uri_to_iterate = '%s/%s' % (blr.GetUriString(), wc)
        else:
          # Convert bucket to implicit recursive wildcard.
          uri_to_iterate = blr.GetUri().clone_replace_name(wc)
        wc_iter = PluralityCheckableIterator(
            self._WildcardIterator(uri_to_iterate))
        src_uri_expands_to_multi = (src_uri_expands_to_multi
                                    or wc_iter.has_plurality())
        is_multi_src_request = (self.uri_strs.has_plurality()
                                or src_uri_expands_to_multi)
        for blr in wc_iter:
          yield NameExpansionResult(uri_str, is_multi_src_request,
                                    src_uri_expands_to_multi, True,
                                    blr.GetUriString(),
                                    self.have_existing_dst_container,
                                    is_latest=blr.IsLatest())
Example #35
0
    def __iter__(self):
        for uri_str in self.uri_strs:
            # Step 1: Expand any explicitly specified wildcards. The output from this
            # step is an iterator of BucketListingRef.
            # Starting with gs://buck*/abc* this step would expand to gs://bucket/abcd
            if ContainsWildcard(uri_str):
                post_step1_iter = self._WildcardIterator(uri_str)
            else:
                suri = self.suri_builder.StorageUri(uri_str)
                post_step1_iter = iter([BucketListingRef(suri)])
            post_step1_iter = PluralityCheckableIterator(post_step1_iter)

            # Step 2: Expand bucket subdirs and versions. The output from this
            # step is an iterator of (names_container, BucketListingRef).
            # Starting with gs://bucket/abcd this step would expand to:
            #   iter([(True, abcd/o1.txt), (True, abcd/o2.txt)]).
            if self.flat and self.recursion_requested:
                post_step2_iter = _ImplicitBucketSubdirIterator(
                    self, post_step1_iter, self.flat)
            elif self.all_versions:
                post_step2_iter = _AllVersionIterator(self,
                                                      post_step1_iter,
                                                      headers=self.headers)
            else:
                post_step2_iter = _NonContainerTuplifyIterator(post_step1_iter)
            post_step2_iter = PluralityCheckableIterator(post_step2_iter)

            # Step 3. Expand directories and buckets. This step yields the iterated
            # values. Starting with gs://bucket this step would expand to:
            #  [abcd/o1.txt, abcd/o2.txt, xyz/o1.txt, xyz/o2.txt]
            # Starting with file://dir this step would expand to:
            #  [dir/a.txt, dir/b.txt, dir/c/]
            exp_src_bucket_listing_refs = []
            wc = self._flatness_wildcard[self.flat]
            src_uri_expands_to_multi = (post_step1_iter.has_plurality()
                                        or post_step2_iter.has_plurality())
            is_multi_src_request = (self.uri_strs.has_plurality()
                                    or src_uri_expands_to_multi)

            if post_step2_iter.is_empty():
                raise CommandException('No URIs matched: %s' % uri_str)
            for (names_container, blr) in post_step2_iter:
                if (not blr.GetUri().names_container()
                        and (self.flat or not blr.HasPrefix())):
                    yield NameExpansionResult(uri_str,
                                              is_multi_src_request,
                                              src_uri_expands_to_multi,
                                              names_container,
                                              blr.GetUriString(),
                                              self.have_existing_dst_container,
                                              is_latest=blr.IsLatest())
                    continue
                if not self.recursion_requested:
                    if blr.GetUri().is_file_uri():
                        desc = 'directory'
                    else:
                        desc = 'bucket'
                    print 'Omitting %s "%s". (Did you mean to do %s -R?)' % (
                        desc, blr.GetUri(), self.command_name)
                    continue
                if blr.GetUri().is_file_uri():
                    # Convert dir to implicit recursive wildcard.
                    uri_to_iterate = '%s/%s' % (blr.GetUriString(), wc)
                else:
                    # Convert bucket to implicit recursive wildcard.
                    uri_to_iterate = blr.GetUri().clone_replace_name(wc)
                wc_iter = PluralityCheckableIterator(
                    self._WildcardIterator(uri_to_iterate))
                src_uri_expands_to_multi = (src_uri_expands_to_multi
                                            or wc_iter.has_plurality())
                is_multi_src_request = (self.uri_strs.has_plurality()
                                        or src_uri_expands_to_multi)
                for blr in wc_iter:
                    yield NameExpansionResult(uri_str,
                                              is_multi_src_request,
                                              src_uri_expands_to_multi,
                                              True,
                                              blr.GetUriString(),
                                              self.have_existing_dst_container,
                                              is_latest=blr.IsLatest())
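
Both versions of this iterator index self._flatness_wildcard with a boolean to pick the expansion wildcard. A plausible reconstruction of that mapping (the exact literals are an assumption, not quoted from the source):

# Hypothetical reconstruction: flat/recursive expansion uses the recursive
# '**' wildcard; single-level expansion uses '*'.
_flatness_wildcard = {True: '**', False: '*'}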
Example #36
0
def NameExpansionIterator(command_name, proj_id_handler, headers, debug,
                          logger, bucket_storage_uri_class, uri_strs,
                          recursion_requested,
                          have_existing_dst_container=None, flat=True,
                          all_versions=False,
                          for_all_version_delete=False,
                          cmd_supports_recursion=True):
  """
  Static factory function for instantiating _NameExpansionIterator, which
  wraps the resulting iterator in a PluralityCheckableIterator and checks
  that it is non-empty. Also allows uri_strs to be either an array or an
  iterator.

  Args:
    command_name: name of command being run.
    proj_id_handler: ProjectIdHandler to use for current command.
    headers: Dictionary containing optional HTTP headers to pass to boto.
    debug: Debug level to pass in to boto connection (range 0..3).
    logger: logging.Logger object.
    bucket_storage_uri_class: Class to instantiate for cloud StorageUris.
        Settable for testing/mocking.
    uri_strs: PluralityCheckableIterator of URI strings needing expansion.
    recursion_requested: True if -R specified on command-line.
    have_existing_dst_container: Bool indicator whether this is a copy
        request to an existing bucket, bucket subdir, or directory. Default
        None value should be used in cases where this is not needed (commands
        other than cp).
    flat: Bool indicating whether bucket listings should be flattened, i.e.,
        so the mapped-to results contain objects spanning subdirectories.
    all_versions: Bool indicating whether to iterate over all object versions.
    for_all_version_delete: Bool indicating whether this is for an all-version
        delete.
    cmd_supports_recursion: Bool indicating whether this command supports a '-R'
        flag. Useful for printing helpful error messages.

  Examples of NameExpansionIterator with flat=True:
    - Calling with one of the uri_strs being 'gs://bucket' will enumerate all
      top-level objects, as will 'gs://bucket/' and 'gs://bucket/*'.
    - 'gs://bucket/**' will enumerate all objects in the bucket.
    - 'gs://bucket/abc' will enumerate all next-level objects under directory
      abc (i.e., not including subdirectories of abc) if gs://bucket/abc/*
      matches any objects; otherwise it will enumerate the single name
      gs://bucket/abc
    - 'gs://bucket/abc/**' will enumerate all objects under abc or any of its
      subdirectories.
    - 'file:///tmp' will enumerate all files under /tmp, as will
      'file:///tmp/*'
    - 'file:///tmp/**' will enumerate all files under /tmp or any of its
      subdirectories.

  Example if flat=False: calling with gs://bucket/abc/* lists matching objects
  or subdirs, but not sub-subdirs or objects beneath subdirs.

  Note: In step-by-step comments below we give examples assuming there's a
  gs://bucket with object paths:
    abcd/o1.txt
    abcd/o2.txt
    xyz/o1.txt
    xyz/o2.txt
  and a directory file://dir with file paths:
    dir/a.txt
    dir/b.txt
    dir/c/
  """
  uri_strs = PluralityCheckableIterator(uri_strs)
  name_expansion_iterator = _NameExpansionIterator(
      command_name, proj_id_handler, headers, debug, logger,
      bucket_storage_uri_class, uri_strs, recursion_requested,
      have_existing_dst_container, flat, all_versions=all_versions,
      for_all_version_delete=for_all_version_delete,
      cmd_supports_recursion=cmd_supports_recursion)
  name_expansion_iterator = PluralityCheckableIterator(name_expansion_iterator)
  if name_expansion_iterator.is_empty():
    raise CommandException('No URIs matched')
  return name_expansion_iterator
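
The factory's pattern is wrap-and-validate: build the inner iterator, wrap it so emptiness can be checked without consuming results, and fail fast. A generic sketch of the same idiom, assuming only the is_empty() method used above:

def NonEmptyPluralityIterator(inner_iterator, error_msg):
  # Wraps any iterator and raises immediately if it would yield nothing.
  wrapped = PluralityCheckableIterator(inner_iterator)
  if wrapped.is_empty():
    raise CommandException(error_msg)
  return wrapped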
Example #37
0
class _DiffIterator(object):
  """Iterator yielding sequence of _DiffToApply objects."""

  def __init__(self, command_obj, base_src_url, base_dst_url):
    self.command_obj = command_obj
    self.compute_file_checksums = command_obj.compute_file_checksums
    self.delete_extras = command_obj.delete_extras
    self.recursion_requested = command_obj.recursion_requested
    self.logger = self.command_obj.logger
    self.base_src_url = base_src_url
    self.base_dst_url = base_dst_url
    self.logger.info('Building synchronization state...')

    (src_fh, self.sorted_list_src_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-src-')
    _tmp_files.append(self.sorted_list_src_file_name)
    (dst_fh, self.sorted_list_dst_file_name) = tempfile.mkstemp(
        prefix='gsutil-rsync-dst-')
    _tmp_files.append(self.sorted_list_dst_file_name)
    # Close the file handles; the files will be opened in write mode by
    # _ListUrlRootFunc.
    os.close(src_fh)
    os.close(dst_fh)

    # Build sorted lists of src and dst URLs in parallel. To do this, pass args
    # to _ListUrlRootFunc as tuple (base_url_str, out_filename, desc)
    # where base_url_str is the starting URL string for listing.
    args_iter = iter([
        (self.base_src_url.url_string, self.sorted_list_src_file_name,
         'source'),
        (self.base_dst_url.url_string, self.sorted_list_dst_file_name,
         'destination')
    ])

    # Counter (shared across workers) of non-retryable listing failures.
    command_obj.non_retryable_listing_failures = 0
    shared_attrs = ['non_retryable_listing_failures']
    command_obj.Apply(_ListUrlRootFunc, args_iter, _RootListingExceptionHandler,
                      shared_attrs, arg_checker=DummyArgChecker,
                      parallel_operations_override=True,
                      fail_on_error=True)

    if command_obj.non_retryable_listing_failures:
      raise CommandException('Caught non-retryable exception - aborting rsync')

    self.sorted_list_src_file = open(self.sorted_list_src_file_name, 'r')
    self.sorted_list_dst_file = open(self.sorted_list_dst_file_name, 'r')

    # Wrap iterators in PluralityCheckableIterator so we can check emptiness.
    self.sorted_src_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_src_file))
    self.sorted_dst_urls_it = PluralityCheckableIterator(
        iter(self.sorted_list_dst_file))

  def _ParseTmpFileLine(self, line):
    """Parses output from _BuildTmpOutputLine.

    Parses into tuple:
      (URL, size, crc32c, md5)
    where crc32c and/or md5 can be _NA.

    Args:
      line: The line to parse.

    Returns:
      Parsed tuple: (url, size, crc32c, md5)
    """
    (encoded_url, size, crc32c, md5) = line.split()
    return (_DecodeUrl(encoded_url), int(size), crc32c, md5.strip())

  def _WarnIfMissingCloudHash(self, url_str, crc32c, md5):
    """Warns if given url_str is a cloud URL and is missing both crc32c and md5.

    Args:
      url_str: Destination URL string.
      crc32c: Destination CRC32c.
      md5: Destination MD5.

    Returns:
      True if issued warning.
    """
    # One known way this can currently happen is when rsync'ing objects larger
    # than 5 GB from S3 (for which the etag is not an MD5).
    if (StorageUrlFromString(url_str).IsCloudUrl()
        and crc32c == _NA and md5 == _NA):
      self.logger.warn(
          'Found no hashes to validate %s. Integrity cannot be assured without '
          'hashes.', url_str)
      return True
    return False

  def _ObjectsMatch(self, src_url_str, src_size, src_crc32c, src_md5,
                    dst_url_str, dst_size, dst_crc32c, dst_md5):
    """Returns True if src and dst objects are the same.

    Uses size plus whatever checksums are available.

    Args:
      src_url_str: Source URL string.
      src_size: Source size
      src_crc32c: Source CRC32c.
      src_md5: Source MD5.
      dst_url_str: Destination URL string.
      dst_size: Destination size
      dst_crc32c: Destination CRC32c.
      dst_md5: Destination MD5.

    Returns:
      True/False.
    """
    # Note: This function is called from __iter__, which is called from the
    # Command.Apply driver. Thus, all checksum computation will be run in a
    # single thread, which is good (having multiple threads concurrently
    # computing checksums would thrash the disk).
    if src_size != dst_size:
      return False
    if self.compute_file_checksums:
      (src_crc32c, src_md5, dst_crc32c, dst_md5) = _ComputeNeededFileChecksums(
          self.logger, src_url_str, src_size, src_crc32c, src_md5, dst_url_str,
          dst_size, dst_crc32c, dst_md5)
    if src_md5 != _NA and dst_md5 != _NA:
      self.logger.debug('Comparing md5 for %s and %s', src_url_str, dst_url_str)
      return src_md5 == dst_md5
    if src_crc32c != _NA and dst_crc32c != _NA:
      self.logger.debug(
          'Comparing crc32c for %s and %s', src_url_str, dst_url_str)
      return src_crc32c == dst_crc32c
    if not self._WarnIfMissingCloudHash(src_url_str, src_crc32c, src_md5):
      self._WarnIfMissingCloudHash(dst_url_str, dst_crc32c, dst_md5)
    # Without checksums to compare we depend only on basic size comparison.
    return True

  def __iter__(self):
    """Iterates over src/dst URLs and produces a _DiffToApply sequence.

    Yields:
      The _DiffToApply.
    """
    # Strip trailing slashes, if any, so we compute tail length against
    # consistent position regardless of whether trailing slashes were included
    # or not in URL.
    base_src_url_len = len(self.base_src_url.url_string.rstrip('/\\'))
    base_dst_url_len = len(self.base_dst_url.url_string.rstrip('/\\'))
    src_url_str = dst_url_str = None
    # Invariant: After each yield, the URLs in src_url_str, dst_url_str,
    # self.sorted_src_urls_it, and self.sorted_dst_urls_it are not yet
    # processed. Each time we encounter None in src_url_str or dst_url_str we
    # populate from the respective iterator, and we reset one or the other value
    # to None after yielding an action that disposes of that URL.
    while not self.sorted_src_urls_it.IsEmpty() or src_url_str is not None:
      if src_url_str is None:
        (src_url_str, src_size, src_crc32c, src_md5) = self._ParseTmpFileLine(
            self.sorted_src_urls_it.next())
        # Skip past base URL and normalize slashes so we can compare across
        # clouds/file systems (including Windows).
        src_url_str_to_check = _EncodeUrl(
            src_url_str[base_src_url_len:].replace('\\', '/'))
        dst_url_str_would_copy_to = copy_helper.ConstructDstUrl(
            self.base_src_url, StorageUrlFromString(src_url_str), True, True,
            self.base_dst_url, False, self.recursion_requested).url_string
      if self.sorted_dst_urls_it.IsEmpty():
        # We've reached end of dst URLs, so copy src to dst.
        yield _DiffToApply(
            src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY)
        src_url_str = None
        continue
      if not dst_url_str:
        (dst_url_str, dst_size, dst_crc32c, dst_md5) = (
            self._ParseTmpFileLine(self.sorted_dst_urls_it.next()))
        # Skip past base URL and normalize slashes so we can compare across
        # clouds/file systems (including Windows).
        dst_url_str_to_check = _EncodeUrl(
            dst_url_str[base_dst_url_len:].replace('\\', '/'))

      if src_url_str_to_check < dst_url_str_to_check:
        # There's no dst object corresponding to src object, so copy src to dst.
        yield _DiffToApply(
            src_url_str, dst_url_str_would_copy_to, _DiffAction.COPY)
        src_url_str = None
      elif src_url_str_to_check > dst_url_str_to_check:
        # dst object without a corresponding src object, so remove dst if -d
        # option was specified.
        if self.delete_extras:
          yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
        dst_url_str = None
      else:
        # There is a dst object corresponding to src object, so check if objects
        # match.
        if self._ObjectsMatch(
            src_url_str, src_size, src_crc32c, src_md5,
            dst_url_str, dst_size, dst_crc32c, dst_md5):
          # Continue iterating without yielding a _DiffToApply.
          pass
        else:
          yield _DiffToApply(src_url_str, dst_url_str, _DiffAction.COPY)
        src_url_str = None
        dst_url_str = None

    # If -d option specified any files/objects left in dst iteration should be
    # removed.
    if not self.delete_extras:
      return
    if dst_url_str:
      yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
      dst_url_str = None
    for line in self.sorted_dst_urls_it:
      (dst_url_str, _, _, _) = self._ParseTmpFileLine(line)
      yield _DiffToApply(None, dst_url_str, _DiffAction.REMOVE)
Example #38
0
def testPluralityCheckableIteratorWith1Exception1Elem(self):
        """Tests PluralityCheckableIterator with 2 elements.

    The first element raises an exception.
    """
        class IterTest(object):
            def __init__(self):
                self.position = 0

            def __iter__(self):
                return self

            def next(self):
                if self.position == 0:
                    try:
                        self.position += 1
                        raise CustomTestException('Test exception 0')
                    except CustomTestException, e:
                        return (e, sys.exc_info()[2])
                elif self.position == 1:
                    self.position += 1
                    return 1
                else:
                    raise StopIteration()

        pcit = PluralityCheckableIterator(IterTest())
        try:
            for _ in pcit:
                pass
            self.fail('Expected exception 0 from iterator')
        except CustomTestException, e:
            self.assertIn('Test exception 0', e.message)
        iterated_value = None
        for value in pcit:
            iterated_value = value
        self.assertEqual(iterated_value, 1)
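
The (exception, stack_trace) tuple returned by IterTest.next() above is the buffering convention noted in Example #33: the wrapping PluralityCheckableIterator holds the exception and re-raises it when that position is consumed. A minimal producer sketch using the same convention:

import sys

def producer():
    # Yield a buffered exception first, then a normal value; a wrapping
    # PluralityCheckableIterator re-raises the exception on consumption.
    try:
        raise CustomTestException('deferred failure')
    except CustomTestException, e:
        yield (e, sys.exc_info()[2])
    yield 1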