def list_prefix(self, path):
    """Lists files matching the prefix.

    Args:
      path: GCS file path pattern in the form gs://<bucket>/[name].

    Returns:
      Dictionary of file name -> size.
    """
    bucket, prefix = parse_gcs_path(path, object_optional=True)
    request = storage.StorageObjectsListRequest(bucket=bucket, prefix=prefix)
    sizes = {}
    seen = 0
    started = time.time()
    logging.info("Starting the size estimation of the input")
    # Walk every result page until the service stops returning a page token.
    while True:
        response = self.client.objects.List(request)
        for obj in response.items:
            sizes['gs://%s/%s' % (obj.bucket, obj.name)] = obj.size
            seen += 1
            # Periodic progress log for very large listings.
            if seen % 10000 == 0:
                logging.info("Finished computing size of: %s files",
                             len(sizes))
        if not response.nextPageToken:
            break
        request.pageToken = response.nextPageToken
    logging.info("Finished listing %s files in %s seconds.",
                 seen, time.time() - started)
    return sizes
def size_of_files_in_glob(self, pattern, limit=None):
    """Returns the size of all the files in the glob as a dictionary

    Args:
      pattern: GCS file path pattern in the form gs://<bucket>/<name_pattern>;
        name_pattern may contain glob characters recognized by
        fnmatch.fnmatch() (*, ?, and [...] sets).
      limit: Maximal number of matched files to measure. All matching files
        are measured if set to None.

    Returns:
      Dictionary of file name -> size for the files matching the pattern.
    """
    bucket, name_pattern = parse_gcs_path(pattern)
    # Get the prefix with which we can list objects in the given bucket.
    prefix = re.match('^[^[*?]*', name_pattern).group(0)
    request = storage.StorageObjectsListRequest(bucket=bucket, prefix=prefix)
    file_sizes = {}
    counter = 0
    start_time = time.time()
    logging.info("Starting the size estimation of the input")
    while True:
        response = self.client.objects.List(request)
        for item in response.items:
            # Listing is prefix-based and may over-match; apply the full
            # glob pattern client-side before recording the size.
            if fnmatch.fnmatch(item.name, name_pattern):
                file_name = 'gs://%s/%s' % (item.bucket, item.name)
                file_sizes[file_name] = item.size
                counter += 1
                # Stop scanning the current page once the limit is reached.
                if limit is not None and counter >= limit:
                    break
                # Periodic progress log for very large inputs.
                if counter % 10000 == 0:
                    logging.info("Finished computing size of: %s files",
                                 len(file_sizes))
        if response.nextPageToken:
            request.pageToken = response.nextPageToken
            # Do not fetch further pages once enough files were measured.
            if limit is not None and len(file_sizes) >= limit:
                break
        else:
            break
    logging.info(
        "Finished the size estimation of the input at %s files. " +
        "Estimation took %s seconds", counter, time.time() - start_time)
    return file_sizes
def glob(self, pattern, limit=None):
    """Return the GCS path names matching a given path name pattern.

    Path name patterns are those recognized by fnmatch.fnmatch(). The path
    can contain glob characters (*, ?, and [...] sets).

    Args:
      pattern: GCS file path pattern in the form gs://<bucket>/<name_pattern>.
      limit: Maximal number of path names to return. All matching paths are
        returned if set to None.

    Returns:
      list of GCS file paths matching the given pattern.
    """
    bucket, name_pattern = parse_gcs_path(pattern)
    # Objects can only be listed by prefix; the glob itself is applied
    # client-side to each returned name.
    prefix = re.match('^[^[*?]*', name_pattern).group(0)
    request = storage.StorageObjectsListRequest(bucket=bucket, prefix=prefix)
    matches = []
    fetch_more = True
    while fetch_more:
        response = self.client.objects.List(request)
        matches.extend(
            'gs://%s/%s' % (item.bucket, item.name)
            for item in response.items
            if fnmatch.fnmatch(item.name, name_pattern))
        token = response.nextPageToken
        # Keep paging only while there is another page and the limit (if
        # any) has not been reached.
        if token and (limit is None or len(matches) < limit):
            request.pageToken = token
        else:
            fetch_more = False
    return matches[:limit]
def list_prefix(self, path, with_metadata=False):
    """Lists files matching the prefix.

    Args:
      path: GCS file path pattern in the form gs://<bucket>/[name].
      with_metadata: Experimental. Specify whether returns file metadata.

    Returns:
      If ``with_metadata`` is False: dict of file name -> size; if
      ``with_metadata`` is True: dict of file name -> tuple(size, timestamp).
    """
    bucket, prefix = parse_gcs_path(path, object_optional=True)
    request = storage.StorageObjectsListRequest(bucket=bucket, prefix=prefix)
    results = {}
    total = 0
    started_at = time.time()
    if with_metadata:
        _LOGGER.info("Starting the file information of the input")
    else:
        _LOGGER.info("Starting the size estimation of the input")
    more_pages = True
    while more_pages:
        response = self.client.objects.List(request)
        for entry in response.items:
            key = 'gs://%s/%s' % (entry.bucket, entry.name)
            if with_metadata:
                # Timestamp is normalized to seconds by the helper.
                results[key] = (
                    entry.size, self._updated_to_seconds(entry.updated))
            else:
                results[key] = entry.size
            total += 1
            # Periodic progress log for very large listings.
            if total % 10000 == 0:
                if with_metadata:
                    _LOGGER.info(
                        "Finished computing file information of: %s files",
                        len(results))
                else:
                    _LOGGER.info("Finished computing size of: %s files",
                                 len(results))
        more_pages = bool(response.nextPageToken)
        if more_pages:
            request.pageToken = response.nextPageToken
    _LOGGER.info("Finished listing %s files in %s seconds.",
                 total, time.time() - started_at)
    return results
def size_of_files_in_glob(self, pattern):
    """Returns the size of all the files in the glob as a dictionary

    Args:
      pattern: a file path pattern that reads the size of all the files

    Returns:
      Dictionary of file name -> size for the files matching the pattern.
    """
    bucket, name_pattern = parse_gcs_path(pattern)
    # Get the prefix with which we can list objects in the given bucket.
    prefix = re.match('^[^[*?]*', name_pattern).group(0)
    request = storage.StorageObjectsListRequest(bucket=bucket, prefix=prefix)
    file_sizes = {}
    while True:
        response = self.client.objects.List(request)
        for item in response.items:
            # Listing is prefix-based and may over-match; apply the full
            # glob pattern client-side before recording the size.
            if fnmatch.fnmatch(item.name, name_pattern):
                file_name = 'gs://%s/%s' % (item.bucket, item.name)
                file_sizes[file_name] = item.size
        # Follow the pagination token until the listing is exhausted.
        if response.nextPageToken:
            request.pageToken = response.nextPageToken
        else:
            break
    return file_sizes