Example #1
0
    def list_prefix(self, path):
        """Lists files matching the prefix.

        Args:
          path: GCS file path pattern in the form gs://<bucket>/[name].

        Returns:
          Dictionary of file name -> size.
        """
        bucket, prefix = parse_gcs_path(path, object_optional=True)
        request = storage.StorageObjectsListRequest(bucket=bucket,
                                                    prefix=prefix)
        name_to_size = {}
        num_seen = 0
        started = time.time()
        logging.info("Starting the size estimation of the input")
        # Walk every result page; the service signals the last page by
        # returning no nextPageToken.
        more_pages = True
        while more_pages:
            response = self.client.objects.List(request)
            for obj in response.items:
                name_to_size['gs://%s/%s' % (obj.bucket, obj.name)] = obj.size
                num_seen += 1
                # Periodic progress logging for very large listings.
                if num_seen % 10000 == 0:
                    logging.info("Finished computing size of: %s files",
                                 len(name_to_size))
            if response.nextPageToken:
                request.pageToken = response.nextPageToken
            else:
                more_pages = False
        logging.info("Finished listing %s files in %s seconds.", num_seen,
                     time.time() - started)
        return name_to_size
Example #2
0
  def size_of_files_in_glob(self, pattern, limit=None):
    """Returns the size of all the files in the glob as a dictionary

    Args:
      pattern: a file path pattern that reads the size of all the files
      limit: optional maximum number of matching files to collect.
        All matching files are collected if set to None.

    Returns:
      Dictionary of file name -> size.
    """
    bucket, name_pattern = parse_gcs_path(pattern)
    # Get the prefix with which we can list objects in the given bucket.
    prefix = re.match('^[^[*?]*', name_pattern).group(0)
    request = storage.StorageObjectsListRequest(bucket=bucket, prefix=prefix)
    file_sizes = {}
    counter = 0
    start_time = time.time()
    logging.info("Starting the size estimation of the input")
    while True:
      response = self.client.objects.List(request)
      for item in response.items:
        if fnmatch.fnmatch(item.name, name_pattern):
          file_name = 'gs://%s/%s' % (item.bucket, item.name)
          file_sizes[file_name] = item.size
          counter += 1
          # Log progress only right after a match is recorded.  The original
          # placed this check outside the fnmatch branch, so it fired for
          # every non-matching object while counter was 0 (or stuck at a
          # multiple of 10000), spamming the log.
          if counter % 10000 == 0:
            logging.info("Finished computing size of: %s files",
                         len(file_sizes))
        if limit is not None and counter >= limit:
          break
      # Continue only if there are more pages and the limit is not reached.
      if response.nextPageToken and (limit is None or counter < limit):
        request.pageToken = response.nextPageToken
      else:
        break
    logging.info(
        "Finished the size estimation of the input at %s files. "
        "Estimation took %s seconds", counter, time.time() - start_time)
    return file_sizes
Example #3
0
  def glob(self, pattern, limit=None):
    """Return the GCS path names matching a given path name pattern.

    Path name patterns are those recognized by fnmatch.fnmatch().  The path
    can contain glob characters (*, ?, and [...] sets).

    Args:
      pattern: GCS file path pattern in the form gs://<bucket>/<name_pattern>.
      limit: Maximal number of path names to return.
        All matching paths are returned if set to None.

    Returns:
      list of GCS file paths matching the given pattern.
    """
    bucket, name_pattern = parse_gcs_path(pattern)
    # The longest glob-free leading part of the pattern serves as the
    # listing prefix so we only fetch candidate objects from the service.
    prefix = re.match('^[^[*?]*', name_pattern).group(0)
    request = storage.StorageObjectsListRequest(bucket=bucket, prefix=prefix)
    matches = []
    while True:
      response = self.client.objects.List(request)
      matches.extend(
          'gs://%s/%s' % (item.bucket, item.name)
          for item in response.items
          if fnmatch.fnmatch(item.name, name_pattern))
      # Stop when the listing is exhausted or enough matches were gathered;
      # otherwise advance to the next result page.
      if not response.nextPageToken:
        break
      if limit is not None and len(matches) >= limit:
        break
      request.pageToken = response.nextPageToken
    # Trim any overshoot from the final page (a no-op when limit is None).
    return matches[:limit]
Example #4
0
    def list_prefix(self, path, with_metadata=False):
        """Lists files matching the prefix.

        Args:
          path: GCS file path pattern in the form gs://<bucket>/[name].
          with_metadata: Experimental. Specify whether returns file metadata.

        Returns:
          If ``with_metadata`` is False: dict of file name -> size; if
            ``with_metadata`` is True: dict of file name ->
            tuple(size, timestamp).
        """
        bucket, prefix = parse_gcs_path(path, object_optional=True)
        request = storage.StorageObjectsListRequest(bucket=bucket,
                                                    prefix=prefix)
        file_info = {}
        num_listed = 0
        started_at = time.time()
        if with_metadata:
            _LOGGER.info("Starting the file information of the input")
        else:
            _LOGGER.info("Starting the size estimation of the input")
        # Page through the full listing; absence of nextPageToken marks
        # the final page.
        keep_paging = True
        while keep_paging:
            response = self.client.objects.List(request)
            for item in response.items:
                name = 'gs://%s/%s' % (item.bucket, item.name)
                if with_metadata:
                    file_info[name] = (
                        item.size, self._updated_to_seconds(item.updated))
                else:
                    file_info[name] = item.size
                num_listed += 1
                # Emit periodic progress for very large listings.
                if num_listed % 10000 == 0:
                    if with_metadata:
                        _LOGGER.info(
                            "Finished computing file information of: %s files",
                            len(file_info))
                    else:
                        _LOGGER.info("Finished computing size of: %s files",
                                     len(file_info))
            if response.nextPageToken:
                request.pageToken = response.nextPageToken
            else:
                keep_paging = False
        _LOGGER.info("Finished listing %s files in %s seconds.", num_listed,
                     time.time() - started_at)
        return file_info
Example #5
0
  def size_of_files_in_glob(self, pattern):
    """Returns the size of all the files in the glob as a dictionary

    Args:
      pattern: a file path pattern that reads the size of all the files

    Returns:
      Dictionary of file name -> size.
    """
    # NOTE(review): the original docstring documented a nonexistent
    # ``path`` argument; the parameter is ``pattern``.
    bucket, name_pattern = parse_gcs_path(pattern)
    # Get the prefix with which we can list objects in the given bucket.
    prefix = re.match('^[^[*?]*', name_pattern).group(0)
    request = storage.StorageObjectsListRequest(bucket=bucket, prefix=prefix)
    file_sizes = {}
    while True:
      response = self.client.objects.List(request)
      for item in response.items:
        if fnmatch.fnmatch(item.name, name_pattern):
          file_name = 'gs://%s/%s' % (item.bucket, item.name)
          file_sizes[file_name] = item.size
      # Follow pagination until the service stops returning a token.
      if response.nextPageToken:
        request.pageToken = response.nextPageToken
      else:
        break
    return file_sizes